Date: (Wed) Apr 08, 2015
Data: Source: Training: https://courses.edx.org/c4x/MITx/15.071x_2/asset/ClaimsData.csv.zip
New:
Time period:
Based on analysis utilizing <> techniques,
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
Use plot.ly for interactive plots ?
rm(list=ls())
set.seed(12345)
options(stringsAsFactors=FALSE)
source("~/Dropbox/datascience/R/mydsutils.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
# Gather all package requirements here
#suppressPackageStartupMessages(require())
#packageVersion("caret")
#require(sos); findFn("pinv", maxPages=2, sortby="MaxScore")
# Analysis control global variables
glb_trnng_url <- "https://courses.edx.org/c4x/MITx/15.071x_2/asset/ClaimsData.csv.zip"
glb_newdt_url <- "<newdt_url>"
glb_is_separate_newent_dataset <- FALSE # or TRUE
glb_split_entity_newent_datasets <- TRUE # or FALSE
glb_split_newdata_method <- "sample" # "condition" or "sample"
glb_split_newdata_condition <- "<col_name> <condition_operator> <value>" # or NULL
glb_split_newdata_size_ratio <- 0.4 # > 0 & < 1
glb_split_sample.seed <- 88 # or any integer
glb_max_obs <- 5000 # or NULL
glb_is_regression <- FALSE; glb_is_classification <- TRUE
glb_rsp_var_raw <- "bucket2009"
# for classification, the response variable has to be a factor
# especially for random forests (method="rf")
glb_rsp_var <- "bucket2009.fctr" # or glb_rsp_var_raw
# if the response factor is based on numbers e.g (0/1 vs. "A"/"B"),
# caret predict(..., type="prob") crashes
# Map the raw response (numeric bucket 1..5) to a factor whose levels are
# syntactically valid R names (e.g. 1 -> "B1"); caret's predict(type="prob")
# requires character-named factor levels for classification.
glb_map_rsp_raw_to_var <- function(raw) {
factor(sprintf("B%s", raw))
} # or NULL
#glb_map_rsp_raw_to_var(c(1, 2, 3, 4, 5))
# Invert glb_map_rsp_raw_to_var: strip the "B" prefix and restore the
# numeric bucket value.
# BUGFIX: the previous body, as.numeric(var), returned the factor's
# internal integer codes, not the raw values — wrong whenever a level is
# absent (e.g. factor(c("B3","B1")) yielded c(2, 1) instead of c(3, 1)).
glb_map_rsp_var_to_raw <- function(var) {
as.numeric(sub("^B", "", as.character(var)))
} # or NULL
#glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 2, 3, 4, 5)))
if ((glb_rsp_var != glb_rsp_var_raw) & is.null(glb_map_rsp_raw_to_var))
stop("glb_map_rsp_raw_to_var function expected")
glb_rsp_var_out <- paste0(glb_rsp_var, ".prediction")
glb_id_vars <- NULL # or c("<id_var>")
glb_exclude_vars_as_features <- union(glb_id_vars, c(glb_rsp_var_raw, ".rnorm"))
# List transformed vars
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features,
NULL) # or c("<col_name>")
# List feats that shd be excluded due to known causation by prediction variable
glb_exclude_vars_as_features <- union(union(glb_exclude_vars_as_features,
paste(glb_rsp_var_out, c("", ".proba"), sep="")),
c("reimbursement2009")) # or NULL
glb_impute_na_data <- FALSE # or TRUE
glb_mice_complete.seed <- 144 # or any integer
glb_fin_mdl <- glb_sel_mdl <- glb_dmy_mdl <- NULL;
# Classification
#glb_models_method_vctr <- c("glm", "rpart", "rf") # Binomials
#glb_models_method_vctr <- c("rpart", "rf") # Multinomials
glb_models_method_vctr <- c("rpart") # Multinomials - this exercise
glb_models_lst <- list(); glb_models_df <- data.frame()
glb_loss_mtrx <- matrix(c(0,1,2,3,4,
2,0,1,2,3,
4,2,0,1,2,
6,4,2,0,1,
8,6,4,2,0
), byrow=TRUE, nrow=5) # or NULL
# caret-style summary function (legacy): average weighted misclassification
# cost over the resample.
# data  - caret resample data.frame with $obs and $pred columns
# lev, model - caret summaryFunction interface arguments (unused here)
# Returns a named numeric scalar: loss.error = sum(count * penalty) / n.
# Uses project helper mycompute_confusion_df and global glb_loss_mtrx.
glb_loss_smmry_old <- function(data, lev=NULL, model=NULL) {
confusion_df <- mycompute_confusion_df(data, "obs", "pred")
# Drop the first (label) column; remaining cells are class counts
confusion_mtrx <- as.matrix(confusion_df[, -1])
# Pad with zero columns when some classes were never predicted, making the
# matrix square and conformable with glb_loss_mtrx.
# NOTE(review): if ncol > nrow the rep() length below is negative and this
# errors; presumably obs always covers all classes — confirm.
confusion_mtrx <- cbind(confusion_mtrx,
matrix(rep(0,
(nrow(confusion_mtrx) - ncol(confusion_mtrx)) * nrow(confusion_mtrx)),
byrow=TRUE, nrow=nrow(confusion_mtrx)))
# Average penalty per observation under the asymmetric loss matrix
metric <- sum(confusion_mtrx * glb_loss_mtrx) / nrow(data)
names(metric) <- "loss.error"
return(metric)
}
# caret summaryFunction: average weighted misclassification cost, using
# caret::confusionMatrix for the cross-tab and global glb_loss_mtrx for the
# per-cell penalties. Prints the confusion matrix as a side effect.
# NOTE(review): confusionMatrix treats its first argument as the prediction;
# passing data$obs first transposes the usual orientation — confirm it
# matches glb_loss_mtrx (the _t variant below suggests this was under test).
glb_loss_smmry <- function(data, lev=NULL, model=NULL) {
conf_mtrx <- as.matrix(confusionMatrix(data$obs, data$pred))
print(conf_mtrx)
metric <- c(loss.error = sum(conf_mtrx * glb_loss_mtrx) / nrow(data))
return(metric)
}
# Variant of glb_loss_smmry that applies the TRANSPOSED loss matrix —
# kept to check which orientation of glb_loss_mtrx matches the
# confusionMatrix(obs, pred) cross-tab. Prints the matrix as a side effect.
glb_loss_smmry_t <- function(data, lev=NULL, model=NULL) {
conf_mtrx <- as.matrix(confusionMatrix(data$obs, data$pred))
print(conf_mtrx)
metric <- c(loss.error = sum(conf_mtrx * t(glb_loss_mtrx)) / nrow(data))
return(metric)
}
glb_tune_models_df <-
rbind(
data.frame(parameter="cp", min=0.00, max=0.04, by=0.01),
data.frame(parameter="mtry", min=2, max=4, by=1)
)
# or NULL
glb_n_cv_folds <- 3 # or NULL
glb_clf_proba_threshold <- NULL # 0.5
# Baseline prediction model feature(s)
glb_bsl_mdl_var <- c("bucket2008") # or NULL
# Depict process
glb_analytics_pn <- petrinet(name="glb_analytics_pn",
trans_df=data.frame(id=1:6,
name=c("data.training.all","data.new",
"model.selected","model.final",
"data.training.all.prediction","data.new.prediction"),
x=c( -5,-5,-15,-25,-25,-35),
y=c( -5, 5, 0, 0, -5, 5)
),
places_df=data.frame(id=1:4,
name=c("bgn","fit.data.training.all","predict.data.new","end"),
x=c( -0, -20, -30, -40),
y=c( 0, 0, 0, 0),
M0=c( 3, 0, 0, 0)
),
arcs_df=data.frame(
begin=c("bgn","bgn","bgn",
"data.training.all","model.selected","fit.data.training.all",
"fit.data.training.all","model.final",
"data.new","predict.data.new",
"data.training.all.prediction","data.new.prediction"),
end =c("data.training.all","data.new","model.selected",
"fit.data.training.all","fit.data.training.all","model.final",
"data.training.all.prediction","predict.data.new",
"predict.data.new","data.new.prediction",
"end","end")
))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
glb_script_tm <- proc.time()
glb_script_df <- data.frame(chunk_label="import_data",
chunk_step_major=1, chunk_step_minor=0,
elapsed=(proc.time() - glb_script_tm)["elapsed"])
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed import_data 1 0 0.002
1: import data
glb_entity_df <- myimport_data(
url=glb_trnng_url,
comment="glb_entity_df", force_header=TRUE,
print_diagn=(glb_is_separate_newent_dataset |
!glb_split_entity_newent_datasets))
## [1] "Reading file ./data/ClaimsData.csv..."
## [1] "dimensions of data in ./data/ClaimsData.csv: 458,005 rows x 16 cols"
if (glb_is_separate_newent_dataset) {
glb_newent_df <- myimport_data(
url=glb_newdt_url,
comment="glb_newent_df", force_header=TRUE, print_diagn=TRUE)
} else {
if (!glb_split_entity_newent_datasets) {
stop("Not implemented yet")
glb_newent_df <- glb_entity_df[sample(1:nrow(glb_entity_df),
max(2, nrow(glb_entity_df) / 1000)),]
} else if (glb_split_newdata_method == "condition") {
glb_newent_df <- do.call("subset",
list(glb_entity_df, parse(text=glb_split_newdata_condition)))
glb_entity_df <- do.call("subset",
list(glb_entity_df, parse(text=paste0("!(",
glb_split_newdata_condition,
")"))))
} else if (glb_split_newdata_method == "sample") {
require(caTools)
set.seed(glb_split_sample.seed)
split <- sample.split(glb_entity_df[, glb_rsp_var_raw],
SplitRatio=(1-glb_split_newdata_size_ratio))
glb_newent_df <- glb_entity_df[!split, ]
glb_entity_df <- glb_entity_df[split ,]
} else stop("glb_split_newdata_method should be %in% c('condition', 'sample')")
comment(glb_newent_df) <- "glb_newent_df"
myprint_df(glb_newent_df)
str(glb_newent_df)
if (glb_split_entity_newent_datasets) {
myprint_df(glb_entity_df)
str(glb_entity_df)
}
}
## Loading required package: caTools
## age alzheimers arthritis cancer copd depression diabetes heart.failure
## 3 67 0 0 0 0 0 0 0
## 5 67 0 0 0 0 0 0 0
## 6 68 0 0 0 0 0 0 0
## 8 70 0 0 0 0 0 0 0
## 9 67 0 0 0 0 0 0 0
## 10 67 0 0 0 0 0 0 0
## ihd kidney osteoporosis stroke reimbursement2008 bucket2008
## 3 0 0 0 0 0 1
## 5 0 0 0 0 0 1
## 6 0 0 0 0 0 1
## 8 0 0 0 0 0 1
## 9 0 0 0 0 0 1
## 10 0 0 0 0 0 1
## reimbursement2009 bucket2009
## 3 0 1
## 5 0 1
## 6 0 1
## 8 0 1
## 9 0 1
## 10 0 1
## age alzheimers arthritis cancer copd depression diabetes
## 43967 57 0 0 0 0 0 0
## 70246 70 0 0 0 0 0 0
## 165755 78 0 0 0 0 0 0
## 208131 73 0 1 1 0 0 0
## 319113 87 0 0 0 0 1 1
## 446073 72 1 0 1 0 1 1
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 43967 0 0 0 0 0 0
## 70246 0 0 0 0 0 0
## 165755 0 0 0 0 0 140
## 208131 0 0 0 1 0 5680
## 319113 0 1 0 0 0 2800
## 446073 1 1 0 1 1 16030
## bucket2008 reimbursement2009 bucket2009
## 43967 1 0 1
## 70246 1 0 1
## 165755 1 720 1
## 208131 2 1250 1
## 319113 1 3330 2
## 446073 3 28000 4
## age alzheimers arthritis cancer copd depression diabetes
## 457996 60 0 1 0 1 1 1
## 457998 87 0 0 0 1 1 1
## 458001 61 1 0 0 1 1 1
## 458002 90 1 0 0 1 1 1
## 458003 76 0 1 0 1 1 1
## 458005 80 1 0 0 1 1 1
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 457996 1 1 0 1 1 11720
## 457998 1 1 1 0 0 27750
## 458001 1 1 1 1 1 15960
## 458002 1 1 1 0 0 26870
## 458003 1 1 1 1 1 89140
## 458005 1 1 1 0 1 38320
## bucket2008 reimbursement2009 bucket2009
## 457996 3 142960 5
## 457998 4 148600 5
## 458001 3 154000 5
## 458002 4 155010 5
## 458003 5 155810 5
## 458005 4 189930 5
## 'data.frame': 183202 obs. of 16 variables:
## $ age : int 67 67 68 70 67 67 56 48 99 68 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "comment")= chr "glb_newent_df"
## age alzheimers arthritis cancer copd depression diabetes heart.failure
## 1 85 0 0 0 0 0 0 0
## 2 59 0 0 0 0 0 0 0
## 4 52 0 0 0 0 0 0 0
## 7 75 0 0 0 0 0 0 0
## 11 89 0 0 0 0 0 0 0
## 13 74 0 0 0 0 0 0 0
## ihd kidney osteoporosis stroke reimbursement2008 bucket2008
## 1 0 0 0 0 0 1
## 2 0 0 0 0 0 1
## 4 0 0 0 0 0 1
## 7 0 0 0 0 0 1
## 11 0 0 0 0 0 1
## 13 0 0 0 0 0 1
## reimbursement2009 bucket2009
## 1 0 1
## 2 0 1
## 4 0 1
## 7 0 1
## 11 0 1
## 13 0 1
## age alzheimers arthritis cancer copd depression diabetes
## 138659 69 0 0 0 0 0 0
## 168428 74 1 0 0 0 0 1
## 189703 81 0 0 0 0 0 1
## 225640 78 1 0 0 0 1 0
## 382169 77 1 0 0 1 1 1
## 397881 46 1 0 0 0 0 0
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 138659 0 0 0 0 0 0
## 168428 0 0 0 1 0 720
## 189703 0 1 0 0 0 690
## 225640 0 0 0 1 0 1540
## 382169 1 1 1 1 1 16400
## 397881 1 1 1 0 0 3700
## bucket2008 reimbursement2009 bucket2009
## 138659 1 380 1
## 168428 1 750 1
## 189703 1 1020 1
## 225640 1 1490 1
## 382169 3 6620 2
## 397881 2 8470 3
## age alzheimers arthritis cancer copd depression diabetes
## 457991 76 0 0 0 1 1 1
## 457992 84 0 0 0 1 0 1
## 457997 73 0 0 0 1 1 1
## 457999 83 1 1 0 1 0 1
## 458000 56 0 1 0 1 1 1
## 458004 82 1 0 0 1 0 1
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 457991 1 1 1 1 0 53550
## 457992 1 1 1 0 0 8620
## 457997 1 1 1 1 0 53230
## 457999 1 1 1 1 1 62620
## 458000 1 1 1 1 0 62980
## 458004 1 1 1 1 1 20660
## bucket2008 reimbursement2009 bucket2009
## 457991 4 131960 5
## 457992 3 133500 5
## 457997 4 147760 5
## 457999 5 148860 5
## 458000 5 151880 5
## 458004 4 158800 5
## 'data.frame': 274803 obs. of 16 variables:
## $ age : int 85 59 52 75 89 74 81 86 78 67 ...
## $ alzheimers : int 0 0 0 0 0 0 0 0 0 0 ...
## $ arthritis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ cancer : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ depression : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.failure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ihd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ kidney : int 0 0 0 0 0 0 0 0 0 0 ...
## $ osteoporosis : int 0 0 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ reimbursement2008: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2008 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ reimbursement2009: int 0 0 0 0 0 0 0 0 0 0 ...
## $ bucket2009 : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "comment")= chr "glb_entity_df"
if (!is.null(glb_max_obs)) {
warning("glb_<ent>_df restricted to glb_max_obs: ", format(glb_max_obs, big.mark=","))
glb_entity_df <- glb_entity_df[split <-
sample.split(glb_entity_df[, glb_rsp_var_raw], SplitRatio=glb_max_obs), ]
glb_newent_df <- glb_newent_df[split <-
sample.split(glb_newent_df[, glb_rsp_var_raw], SplitRatio=glb_max_obs), ]
}
## Warning: glb_<ent>_df restricted to glb_max_obs: 5,000
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="cleanse_data",
chunk_step_major=max(glb_script_df$chunk_step_major)+1,
chunk_step_minor=0,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed import_data 1 0 0.002
## elapsed1 cleanse_data 2 0 6.270
2: cleanse data
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="inspectORexplore.data",
chunk_step_major=max(glb_script_df$chunk_step_major),
chunk_step_minor=1,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed1 cleanse_data 2 0 6.270
## elapsed2 inspectORexplore.data 2 1 6.301
2.1: inspect/explore data
#print(str(glb_entity_df))
#View(glb_entity_df)
# List info gathered for various columns
# <col_name>: <description>; <notes>
# Create new features that help diagnostics
# Create factors of string variables
str_vars <- sapply(1:length(names(glb_entity_df)),
function(col) ifelse(class(glb_entity_df[, names(glb_entity_df)[col]]) == "character",
names(glb_entity_df)[col], ""))
if (length(str_vars <- setdiff(str_vars[str_vars != ""],
glb_exclude_vars_as_features)) > 0) {
warning("Creating factors of string variables:", paste0(str_vars, collapse=", "))
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
for (var in str_vars) {
glb_entity_df[, paste0(var, ".fctr")] <- factor(glb_entity_df[, var],
as.factor(union(glb_entity_df[, var], glb_newent_df[, var])))
glb_newent_df[, paste0(var, ".fctr")] <- factor(glb_newent_df[, var],
as.factor(union(glb_entity_df[, var], glb_newent_df[, var])))
}
}
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
# Add generated diagnostic features to obs_df.
# obs_df      - data.frame to augment (training or new-data partition)
# obs_twin_df - the sibling partition; present so factor levels can be built
#               from the union of both partitions (template lines below) —
#               currently unused by the active code
# Currently only adds .rnorm (standard-normal noise, used by the dummy
# model), then prints summary() and per-column NA counts as diagnostics.
# Returns the augmented data.frame.
add_new_diag_feats <- function(obs_df, obs_twin_df) {
require(plyr)
obs_df <- mutate(obs_df,
# <col_name>.NA=is.na(<col_name>),
# <col_name>.fctr=factor(<col_name>,
# as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
# <col_name>.fctr=relevel(factor(<col_name>,
# as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
# "<ref_val>"),
# <col2_name>.fctr=relevel(factor(ifelse(<col1_name> == <val>, "<oth_val>", "<ref_val>")),
# as.factor(c("R", "<ref_val>")),
# ref="<ref_val>"),
# This doesn't work - use sapply instead
# <col_name>.fctr_num=grep(<col_name>, levels(<col_name>.fctr)),
#
# Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M")),
# Year=year(Date.my),
# Month=months(Date.my),
# Weekday=weekdays(Date.my)
# <col_name>.log=log(<col.name>),
# <col_name>=<table>[as.character(<col2_name>)],
# <col_name>=as.numeric(<col2_name>),
# Random noise feature: serves as the dummy/baseline model predictor
.rnorm=rnorm(n=nrow(obs_df))
)
# If levels of a factor are different across obs_df & glb_newent_df; predict.glm fails
# Transformations not handled by mutate
# obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df),
# function(row_ix) grep(obs_df[row_ix, "<col_name>"],
# levels(obs_df[row_ix, "<col_name>.fctr"])))
# Diagnostics: distribution summary and per-column NA counts
print(summary(obs_df))
print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
return(obs_df)
}
glb_entity_df <- add_new_diag_feats(glb_entity_df, glb_newent_df)
## Loading required package: plyr
## age alzheimers arthritis cancer
## Min. : 26.0 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.: 67.0 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median : 73.0 Median :0.0000 Median :0.000 Median :0.0000
## Mean : 72.2 Mean :0.1966 Mean :0.153 Mean :0.0592
## 3rd Qu.: 80.0 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.0000
## Max. :100.0 Max. :1.0000 Max. :1.000 Max. :1.0000
## copd depression diabetes heart.failure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1342 Mean :0.2066 Mean :0.3758 Mean :0.2862
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## ihd kidney osteoporosis stroke
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.4218 Mean :0.1624 Mean :0.1802 Mean :0.0402
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## reimbursement2008 bucket2008 reimbursement2009 bucket2009
## Min. : 0 Min. :1.000 Min. : 0 Min. :1.000
## 1st Qu.: 0 1st Qu.:1.000 1st Qu.: 160 1st Qu.:1.000
## Median : 955 Median :1.000 Median : 1505 Median :1.000
## Mean : 3937 Mean :1.433 Mean : 4310 Mean :1.522
## 3rd Qu.: 3080 3rd Qu.:2.000 3rd Qu.: 4180 3rd Qu.:2.000
## Max. :133330 Max. :5.000 Max. :131960 Max. :5.000
## .rnorm
## Min. :-3.323274
## 1st Qu.:-0.709937
## Median :-0.015250
## Mean :-0.009294
## 3rd Qu.: 0.681998
## Max. : 3.675877
## age alzheimers arthritis cancer
## 0 0 0 0
## copd depression diabetes heart.failure
## 0 0 0 0
## ihd kidney osteoporosis stroke
## 0 0 0 0
## reimbursement2008 bucket2008 reimbursement2009 bucket2009
## 0 0 0 0
## .rnorm
## 0
glb_newent_df <- add_new_diag_feats(glb_newent_df, glb_entity_df)
## age alzheimers arthritis cancer
## Min. : 26.00 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 67.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 73.00 Median :0.0000 Median :0.0000 Median :0.0000
## Mean : 72.57 Mean :0.1904 Mean :0.1546 Mean :0.0666
## 3rd Qu.: 81.00 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :100.00 Max. :1.0000 Max. :1.0000 Max. :1.0000
## copd depression diabetes heart.failure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1398 Mean :0.2102 Mean :0.3794 Mean :0.2856
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## ihd kidney osteoporosis stroke
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.000
## Mean :0.4206 Mean :0.1646 Mean :0.1718 Mean :0.048
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.000
## reimbursement2008 bucket2008 reimbursement2009 bucket2009
## Min. : 0 Min. :1.00 Min. : 0 Min. :1.000
## 1st Qu.: 0 1st Qu.:1.00 1st Qu.: 110 1st Qu.:1.000
## Median : 945 Median :1.00 Median : 1525 Median :1.000
## Mean : 4044 Mean :1.44 Mean : 4255 Mean :1.522
## 3rd Qu.: 3130 3rd Qu.:2.00 3rd Qu.: 4170 3rd Qu.:2.000
## Max. :173320 Max. :5.00 Max. :103870 Max. :5.000
## .rnorm
## Min. :-3.190467
## 1st Qu.:-0.697116
## Median :-0.004068
## Mean :-0.005770
## 3rd Qu.: 0.675809
## Max. : 3.411899
## age alzheimers arthritis cancer
## 0 0 0 0
## copd depression diabetes heart.failure
## 0 0 0 0
## ihd kidney osteoporosis stroke
## 0 0 0 0
## reimbursement2008 bucket2008 reimbursement2009 bucket2009
## 0 0 0 0
## .rnorm
## 0
# Histogram of predictor in glb_entity_df & glb_newent_df
print(myplot_histogram(glb_entity_df, glb_rsp_var_raw))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
if (glb_is_classification)
print(table(glb_entity_df[, glb_rsp_var_raw]) / nrow(glb_entity_df))
##
## 1 2 3 4 5
## 0.6712 0.1902 0.0894 0.0434 0.0058
print(myplot_histogram(glb_newent_df, glb_rsp_var_raw))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Check for duplicates in glb_id_vars
if (length(glb_id_vars) > 0) {
id_vars_dups_df <- subset(id_vars_df <- mycreate_tbl_df(
rbind(glb_entity_df[, glb_id_vars, FALSE], glb_newent_df[, glb_id_vars, FALSE]),
glb_id_vars), .freq > 1)
if (nrow(id_vars_dups_df) > 0) {
warning("Duplicates found in glb_id_vars data:", nrow(id_vars_dups_df))
myprint_df(id_vars_dups_df)
} else {
# glb_id_vars are unique across obs in both glb_<>_df
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_id_vars)
}
}
#pairs(subset(glb_entity_df, select=-c(col_symbol)))
# Check for glb_newent_df & glb_entity_df features range mismatches
# Other diagnostics:
# print(subset(glb_entity_df, <col1_name> == max(glb_entity_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_entity_df$<col1_name>, na.rm=TRUE)))
# print(glb_entity_df[which.max(glb_entity_df$<col_name>),])
# print(<col_name>_freq_glb_entity_df <- mycreate_tbl_df(glb_entity_df, "<col_name>"))
# print(which.min(table(glb_entity_df$<col_name>)))
# print(which.max(table(glb_entity_df$<col_name>)))
# print(which.max(table(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>)[, 2]))
# print(table(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>))
# print(table(is.na(glb_entity_df$<col1_name>), glb_entity_df$<col2_name>))
# print(table(sign(glb_entity_df$<col1_name>), glb_entity_df$<col2_name>))
# print(mycreate_xtab(glb_entity_df, <col1_name>))
# print(mycreate_xtab(glb_entity_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_entity_df <-
# mycreate_xtab(glb_entity_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_entity_df[is.na(<col1_name>_<col2_name>_xtab_glb_entity_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_entity_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_entity_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_entity_df$<col1_name>.NA, glb_entity_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_entity_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_entity_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_entity_df, Symbol %in% c("KO", "PG")),
# "Date.my", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.Date("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.Date("1983-01-01")))
# )
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>"))
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="manage_missing_data",
chunk_step_major=max(glb_script_df$chunk_step_major),
chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed2 inspectORexplore.data 2 1 6.301
## elapsed3 manage_missing_data 2 2 7.808
2.2: manage missing data
# print(sapply(names(glb_entity_df), function(col) sum(is.na(glb_entity_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
# glb_entity_df <- na.omit(glb_entity_df)
# glb_newent_df <- na.omit(glb_newent_df)
# df[is.na(df)] <- 0
# Not refactored into mydsutils.R since glb_*_df might be reassigned
# Impute missing values with mice over the union of the training (entity)
# and new-data (newent) partitions so a single imputation model covers
# both, then re-partition on the .src tag.
# entity_df, newent_df - the two observation partitions
# Side effect: reassigns the globals glb_entity_df / glb_newent_df via <<-.
# Not refactored into mydsutils.R since glb_*_df might be reassigned.
glb_impute_missing_data <- function(entity_df, newent_df) {
    if (!glb_is_separate_newent_dataset) {
        # Combine entity & newent, tagging each row with its origin
        union_df <- rbind(mutate(entity_df, .src = "entity"),
                          mutate(newent_df, .src = "newent"))
        # Impute only candidate feature columns: drop the response and any
        # columns excluded from modeling
        union_imputed_df <- union_df[, setdiff(setdiff(names(entity_df),
                                                       glb_rsp_var),
                                               glb_exclude_vars_as_features)]
        print(summary(union_imputed_df))
        require(mice)
        set.seed(glb_mice_complete.seed)
        union_imputed_df <- complete(mice(union_imputed_df))
        print(summary(union_imputed_df))
        # Write imputed values back into the combined frame
        union_df[, names(union_imputed_df)] <- union_imputed_df[, names(union_imputed_df)]
        print(summary(union_df))
        # union_df$.rownames <- rownames(union_df)
        # union_df <- orderBy(~.rownames, union_df)
        #
        # imp_entity_df <- myimport_data(
        #     url="<imputed_trnng_url>",
        #     comment="imp_entity_df", force_header=TRUE, print_diagn=TRUE)
        # print(all.equal(subset(union_df, select=-c(.src, .rownames, .rnorm)),
        #                 imp_entity_df))
        # Partition again
        # BUGFIX: .rownames is never created (its assignment above is
        # commented out), so select=-c(.src, .rownames) raised
        # "object '.rownames' not found"; drop only .src.
        glb_entity_df <<- subset(union_df, .src == "entity", select=-c(.src))
        comment(glb_entity_df) <- "entity_df"
        glb_newent_df <<- subset(union_df, .src == "newent", select=-c(.src))
        comment(glb_newent_df) <- "newent_df"
        # Generate summaries
        print(summary(entity_df))
        print(sapply(names(entity_df), function(col) sum(is.na(entity_df[, col]))))
        print(summary(newent_df))
        print(sapply(names(newent_df), function(col) sum(is.na(newent_df[, col]))))
    } else stop("Not implemented yet")
}
if (glb_impute_na_data) {
if ((sum(sapply(names(glb_entity_df),
function(col) sum(is.na(glb_entity_df[, col])))) > 0) |
(sum(sapply(names(glb_newent_df),
function(col) sum(is.na(glb_newent_df[, col])))) > 0))
glb_impute_missing_data(glb_entity_df, glb_newent_df)
}
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="encode_retype_data",
chunk_step_major=max(glb_script_df$chunk_step_major),
chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed3 manage_missing_data 2 2 7.808
## elapsed4 encode_retype_data 2 3 8.228
2.3: encode/retype data
# map_<col_name>_df <- myimport_data(
# url="<map_url>",
# comment="map_<col_name>_df", print_diagn=TRUE)
# map_<col_name>_df <- read.csv(paste0(getwd(), "/data/<file_name>.csv"), strip.white=TRUE)
# glb_entity_df <- mymap_codes(glb_entity_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_newent_df <- mymap_codes(glb_newent_df, "<from_col_name>", "<to_col_name>",
# map_<to_col_name>_df, map_join_col_name="<map_join_col_name>",
# map_tgt_col_name="<to_col_name>")
# glb_entity_df$<col_name>.fctr <- factor(glb_entity_df$<col_name>,
# as.factor(union(glb_entity_df$<col_name>, glb_newent_df$<col_name>)))
# glb_newent_df$<col_name>.fctr <- factor(glb_newent_df$<col_name>,
# as.factor(union(glb_entity_df$<col_name>, glb_newent_df$<col_name>)))
if (!is.null(glb_map_rsp_raw_to_var)) {
glb_entity_df[, glb_rsp_var] <-
glb_map_rsp_raw_to_var(glb_entity_df[, glb_rsp_var_raw])
mycheck_map_results(mapd_df=glb_entity_df,
from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)
glb_newent_df[, glb_rsp_var] <-
glb_map_rsp_raw_to_var(glb_newent_df[, glb_rsp_var_raw])
mycheck_map_results(mapd_df=glb_newent_df,
from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)
}
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## bucket2009 bucket2009.fctr .n
## 1 1 B1 3356
## 2 2 B2 951
## 3 3 B3 447
## 4 4 B4 217
## 5 5 B5 29
## bucket2009 bucket2009.fctr .n
## 1 1 B1 3356
## 2 2 B2 951
## 3 3 B3 447
## 4 4 B4 217
## 5 5 B5 29
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="extract_features",
chunk_step_major=max(glb_script_df$chunk_step_major)+1,
chunk_step_minor=0,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed4 encode_retype_data 2 3 8.228
## elapsed5 extract_features 3 0 10.942
3: extract features
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_entity_df$<col_name>), -2, na.pad=TRUE)
# glb_entity_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newent_df$<col_name>), -2, na.pad=TRUE)
# glb_newent_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newent_df[1, "<col_name>.lag.2"] <- glb_entity_df[nrow(glb_entity_df) - 1,
# "<col_name>"]
# glb_newent_df[2, "<col_name>.lag.2"] <- glb_entity_df[nrow(glb_entity_df),
# "<col_name>"]
# glb_entity_df <- mutate(glb_entity_df,
# <new_col_name>=
# )
# glb_newent_df <- mutate(glb_newent_df,
# <new_col_name>=
# )
# print(summary(glb_entity_df))
# print(summary(glb_newent_df))
# print(sapply(names(glb_entity_df), function(col) sum(is.na(glb_entity_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", smooth=TRUE))
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all","data.new")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="select_features",
chunk_step_major=max(glb_script_df$chunk_step_major)+1,
chunk_step_minor=0,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed5 extract_features 3 0 10.942
## elapsed6 select_features 4 0 12.299
4: select features
print(glb_feats_df <-
myselect_features( entity_df=glb_entity_df,
exclude_vars_as_features=glb_exclude_vars_as_features,
rsp_var=glb_rsp_var))
## id cor.y cor.y.abs
## bucket2008 bucket2008 0.46125446 0.46125446
## diabetes diabetes 0.40896269 0.40896269
## ihd ihd 0.40798592 0.40798592
## reimbursement2008 reimbursement2008 0.40427780 0.40427780
## kidney kidney 0.38971036 0.38971036
## heart.failure heart.failure 0.37320420 0.37320420
## copd copd 0.31969314 0.31969314
## depression depression 0.28668189 0.28668189
## alzheimers alzheimers 0.27882540 0.27882540
## arthritis arthritis 0.25224332 0.25224332
## osteoporosis osteoporosis 0.21770017 0.21770017
## cancer cancer 0.19499896 0.19499896
## stroke stroke 0.16603039 0.16603039
## age age 0.02529723 0.02529723
glb_script_df <- rbind(glb_script_df,
data.frame(chunk_label="remove_correlated_features",
chunk_step_major=max(glb_script_df$chunk_step_major),
chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor
## elapsed6 select_features 4 0
## elapsed7 remove_correlated_features 4 1
## elapsed
## elapsed6 12.299
## elapsed7 12.506
5: run models
max_cor_y_x_var <- subset(glb_feats_df, cor.low == 1)[1, "id"]
# Regression:
if (glb_is_regression) {
# Linear:
myrun_mdl_fn <- myrun_mdl_lm
}
# Classification:
if (glb_is_classification) myrun_mdl_fn <- myrun_mdl_classification
glb_is_binomial <- (length(unique(glb_entity_df[, glb_rsp_var])) <= 2)
# Add dummy model - random variable
ret_lst <- myrun_mdl_fn(indep_vars_vctr=".rnorm",
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out,
fit_df=glb_entity_df, OOB_df=glb_newent_df,
method=ifelse(glb_is_binomial, "glm", "rpart"))
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
##
## Loading required package: rpart
## + Fold1: cp=0.0009124
## - Fold1: cp=0.0009124
## + Fold2: cp=0.0009124
## - Fold2: cp=0.0009124
## + Fold3: cp=0.0009124
## - Fold3: cp=0.0009124
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00142 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.001419303 0 1
##
## Node number 1: 5000 observations
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058) *
# Keep the dummy (random-predictor) model for baseline comparisons
glb_dmy_mdl <- ret_lst[["model"]]
# Fit a single-predictor model on the feature with the highest cor.y;
# glm for binomial responses, rpart otherwise
mdl_method <- ifelse(glb_is_binomial, "glm", "rpart")
ret_lst <- myrun_mdl_fn(indep_vars_vctr=max_cor_y_x_var,
                        rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                        fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=mdl_method)
## + Fold1: cp=0
## - Fold1: cp=0
## + Fold2: cp=0
## - Fold2: cp=0
## + Fold3: cp=0
## - Fold3: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.000811 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.09549878 0 1.0000000
## 2 0.00081103 1 0.9045012
##
## Variable importance
## bucket2008
## 100
##
## Node number 1: 5000 observations, complexity param=0.09549878
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3724 obs) right son=3 (1276 obs)
## Primary splits:
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
##
## Node number 2: 3724 observations
## predicted class=B1 expected loss=0.1922664 P(node) =0.7448
## class counts: 3008 446 183 78 9
## probabilities: 0.808 0.120 0.049 0.021 0.002
##
## Node number 3: 1276 observations
## predicted class=B2 expected loss=0.604232 P(node) =0.2552
## class counts: 348 505 264 139 20
## probabilities: 0.273 0.396 0.207 0.109 0.016
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) bucket2008< 1.5 3724 716 B1 (0.81 0.12 0.049 0.021 0.0024) *
## 3) bucket2008>=1.5 1276 771 B2 (0.27 0.4 0.21 0.11 0.016) *
# Enhance the highest-cor.y model with the features that were dropped due to
# high correlations: as interaction terms for glm (binomial), or as plain
# additional predictors for rpart — only glm supports interaction terms
# (checked that rpart does not).
dropped_feat_ids <- subset(glb_feats_df, is.na(cor.low))[, "id"]
if (length(dropped_feat_ids) > 0) {
    if (glb_is_binomial) {
        # glm: add <max_cor_var>:<dropped_var> interaction terms
        indep_vars_vctr <- c(max_cor_y_x_var,
                             paste(max_cor_y_x_var, dropped_feat_ids, sep=":"))
    } else {
        # rpart: add the dropped features themselves
        indep_vars_vctr <- union(max_cor_y_x_var, dropped_feat_ids)
    }
    ret_lst <- myrun_mdl_fn(indep_vars_vctr,
                            glb_rsp_var, glb_rsp_var_out,
                            fit_df=glb_entity_df, OOB_df=glb_newent_df,
                            method=ifelse(glb_is_binomial, "glm", "rpart"))
}
## + Fold1: cp=0.002839
## - Fold1: cp=0.002839
## + Fold2: cp=0.002839
## - Fold2: cp=0.002839
## + Fold3: cp=0.002839
## - Fold3: cp=0.002839
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00324 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.04866180 0 1.0000000
## 2 0.00324412 2 0.9026764
##
## Variable importance
## reimbursement2008 bucket2008
## 60 40
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.86, adj=0.646, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
##
## Node number 6: 923 observations
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
# Fit a model on all low-correlation features (cor.low == 1)
low_cor_feat_ids <- subset(glb_feats_df, cor.low == 1)[, "id"]
ret_lst <- myrun_mdl_fn(indep_vars_vctr=low_cor_feat_ids,
                        glb_rsp_var, glb_rsp_var_out,
                        fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=ifelse(glb_is_binomial, "glm", "rpart"))
## + Fold1: cp=0.002585
## - Fold1: cp=0.002585
## + Fold2: cp=0.002585
## - Fold2: cp=0.002585
## + Fold3: cp=0.002585
## - Fold3: cp=0.002585
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0176 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.09549878 0 1.0000000
## 2 0.01763990 1 0.9045012
##
## Variable importance
## bucket2008 kidney copd heart.failure cancer
## 51 15 13 9 6
## arthritis
## 6
##
## Node number 1: 5000 observations, complexity param=0.09549878
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3724 obs) right son=3 (1276 obs)
## Primary splits:
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## kidney < 0.5 to the left, improve=213.8953, (0 missing)
## Surrogate splits:
## kidney < 0.5 to the left, agree=0.821, adj=0.299, (0 split)
## copd < 0.5 to the left, agree=0.807, adj=0.245, (0 split)
## heart.failure < 0.5 to the left, agree=0.788, adj=0.168, (0 split)
## cancer < 0.5 to the left, agree=0.774, adj=0.116, (0 split)
## arthritis < 0.5 to the left, agree=0.774, adj=0.114, (0 split)
##
## Node number 2: 3724 observations
## predicted class=B1 expected loss=0.1922664 P(node) =0.7448
## class counts: 3008 446 183 78 9
## probabilities: 0.808 0.120 0.049 0.021 0.002
##
## Node number 3: 1276 observations
## predicted class=B2 expected loss=0.604232 P(node) =0.2552
## class counts: 348 505 264 139 20
## probabilities: 0.273 0.396 0.207 0.109 0.016
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) bucket2008< 1.5 3724 716 B1 (0.81 0.12 0.049 0.021 0.0024) *
## 3) bucket2008>=1.5 1276 771 B2 (0.27 0.4 0.21 0.11 0.016) *
# User specified
# For each user-requested caret method, fit three variants on all non-excluded
# features: nlm (no loss matrix), wlm (weighted loss matrix), and
# tlm (transposed loss matrix). Each caret fit is mirrored by a plain rpart()
# fit at the tuned cp so the tree can be inspected/plotted directly.
for (method in glb_models_method_vctr) {
#print(sprintf("iterating over method:%s", method))
# All X that is not user excluded
indep_vars_vctr <- setdiff(names(glb_entity_df),
union(glb_rsp_var, glb_exclude_vars_as_features))
# easier to exclude features
# indep_vars_vctr <- setdiff(names(glb_entity_df),
# union(union(glb_rsp_var, glb_exclude_vars_as_features),
# c("<feat1_name>", "<feat2_name>")))
# easier to include features
# indep_vars_vctr <- c("<feat1_name>", "<feat2_name>")
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_entity_df),
# union(glb_rsp_var, glb_exclude_vars_as_features)))
# indep_vars_vctr_lst[["feat"]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# --- Variant 1 (nlm): default accuracy metric, no loss matrix ---
set.seed(200)
ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out,
fit_df=glb_entity_df,
OOB_df=glb_newent_df,
method=method,
tune_models_df=glb_tune_models_df,
n_cv_folds=glb_n_cv_folds)
glb_sel_nlm_mdl <- ret_lst[["model"]]
# Re-fit with rpart at the caret-selected cp for direct tree inspection
rpart_sel_nlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var),
data=glb_entity_df, method="class",
control=rpart.control(cp=glb_sel_nlm_mdl$bestTune$cp))
# --- Variant 2 (wlm): minimize loss.error using glb_loss_mtrx ---
set.seed(201)
ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out,
fit_df=glb_entity_df,
OOB_df=glb_newent_df,
method=method,
tune_models_df=glb_tune_models_df,
n_cv_folds=glb_n_cv_folds,
loss_mtrx=glb_loss_mtrx,
summaryFunction=glb_loss_smmry,
metric="loss.error",
maximize=FALSE)
glb_sel_wlm_mdl <- ret_lst[["model"]]
rpart_sel_wlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var),
data=glb_entity_df, method="class",
control=rpart.control(cp=glb_sel_wlm_mdl$bestTune$cp),
parms=list(loss=glb_loss_mtrx))
# --- Variant 3 (tlm): same but with the transposed loss matrix ---
# NOTE(review): set.seed(201) repeats the wlm seed — presumably deliberate
# so wlm/tlm see identical CV folds; confirm this was intended.
set.seed(201)
ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out,
fit_df=glb_entity_df,
OOB_df=glb_newent_df,
method=method,
tune_models_df=glb_tune_models_df,
n_cv_folds=glb_n_cv_folds,
loss_mtrx=t(glb_loss_mtrx),
summaryFunction=glb_loss_smmry_t,
metric="loss.error",
maximize=FALSE)
# The tlm model doubles as the overall selected model glb_sel_mdl
glb_sel_mdl <- glb_sel_tlm_mdl <- ret_lst[["model"]]
# NOTE(review): this rpart call passes the UN-transposed glb_loss_mtrx even
# though the caret fit above used t(glb_loss_mtrx) — verify whether parms
# should be list(loss=t(glb_loss_mtrx)) here.
rpart_sel_tlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var),
data=glb_entity_df, method="class",
control=rpart.control(cp=glb_sel_tlm_mdl$bestTune$cp),
parms=list(loss=glb_loss_mtrx))
}
## + Fold1: cp=0
## - Fold1: cp=0
## + Fold2: cp=0
## - Fold2: cp=0
## + Fold3: cp=0
## - Fold3: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.04866180 0 1.0000000
## 2 0.01094891 2 0.9026764
## 3 0.01000000 3 0.8917275
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations, complexity param=0.01094891
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
## left son=12 (754 obs) right son=13 (169 obs)
## Primary splits:
## kidney < 0.5 to the left, improve=7.902130, (0 missing)
## diabetes < 0.5 to the left, improve=6.451528, (0 missing)
## ihd < 0.5 to the left, improve=6.445818, (0 missing)
## reimbursement2008 < 2545 to the left, improve=5.585770, (0 missing)
## arthritis < 0.5 to the left, improve=5.444392, (0 missing)
## Surrogate splits:
## reimbursement2008 < 3755 to the left, agree=0.818, adj=0.006, (0 split)
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## Node number 12: 754 observations
## predicted class=B1 expected loss=0.4814324 P(node) =0.1508
## class counts: 391 239 79 38 7
## probabilities: 0.519 0.317 0.105 0.050 0.009
##
## Node number 13: 169 observations
## predicted class=B2 expected loss=0.5739645 P(node) =0.0338
## class counts: 54 72 30 12 1
## probabilities: 0.320 0.426 0.178 0.071 0.006
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087)
## 12) kidney< 0.5 754 363 B1 (0.52 0.32 0.1 0.05 0.0093) *
## 13) kidney>=0.5 169 97 B2 (0.32 0.43 0.18 0.071 0.0059) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
## B1 B2 B3 B4 B5
## B1 7 1 1 0 0
## B2 0 0 0 0 0
## B3 0 1 0 0 0
## B4 0 0 0 0 0
## B5 0 0 0 0 0
## + Fold1: cp=0
## B1 B2 B3 B4 B5
## B1 966 125 23 5 0
## B2 167 112 32 6 0
## B3 54 61 23 11 0
## B4 27 27 17 2 0
## B5 2 5 1 1 0
## B1 B2 B3 B4 B5
## B1 1014 105 0 0 0
## B2 170 147 0 0 0
## B3 59 90 0 0 0
## B4 28 45 0 0 0
## B5 1 8 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## - Fold1: cp=0
## + Fold2: cp=0
## B1 B2 B3 B4 B5
## B1 975 115 27 2 0
## B2 157 98 54 8 0
## B3 63 56 28 2 0
## B4 29 21 18 4 0
## B5 4 3 2 1 0
## B1 B2 B3 B4 B5
## B1 1010 109 0 0 0
## B2 141 176 0 0 0
## B3 62 87 0 0 0
## B4 28 44 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 179 138 0 0 0
## B3 74 75 0 0 0
## B4 32 40 0 0 0
## B5 5 5 0 0 0
## B1 B2 B3 B4 B5
## B1 981 138 0 0 0
## B2 132 185 0 0 0
## B3 54 95 0 0 0
## B4 23 49 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1119 0 0 0 0
## B2 317 0 0 0 0
## B3 149 0 0 0 0
## B4 72 0 0 0 0
## B5 10 0 0 0 0
## - Fold2: cp=0
## + Fold3: cp=0
## B1 B2 B3 B4 B5
## B1 998 86 23 11 0
## B2 170 99 36 12 0
## B3 78 48 15 8 0
## B4 29 24 13 6 0
## B5 2 3 0 5 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## - Fold3: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.04866180 0 1.0000000
## 2 0.01094891 2 0.9026764
## 3 0.01000000 3 0.8917275
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations, complexity param=0.01094891
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
## left son=12 (754 obs) right son=13 (169 obs)
## Primary splits:
## kidney < 0.5 to the left, improve=7.902130, (0 missing)
## diabetes < 0.5 to the left, improve=6.451528, (0 missing)
## ihd < 0.5 to the left, improve=6.445818, (0 missing)
## reimbursement2008 < 2545 to the left, improve=5.585770, (0 missing)
## arthritis < 0.5 to the left, improve=5.444392, (0 missing)
## Surrogate splits:
## reimbursement2008 < 3755 to the left, agree=0.818, adj=0.006, (0 split)
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## Node number 12: 754 observations
## predicted class=B1 expected loss=0.4814324 P(node) =0.1508
## class counts: 391 239 79 38 7
## probabilities: 0.519 0.317 0.105 0.050 0.009
##
## Node number 13: 169 observations
## predicted class=B2 expected loss=0.5739645 P(node) =0.0338
## class counts: 54 72 30 12 1
## probabilities: 0.320 0.426 0.178 0.071 0.006
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087)
## 12) kidney< 0.5 754 363 B1 (0.52 0.32 0.1 0.05 0.0093) *
## 13) kidney>=0.5 169 97 B2 (0.32 0.43 0.18 0.071 0.0059) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
## B1 B2 B3 B4 B5
## B1 7 1 1 0 0
## B2 0 0 0 0 0
## B3 0 1 0 0 0
## B4 0 0 0 0 0
## B5 0 0 0 0 0
## + Fold1: cp=0
## B1 B2 B3 B4 B5
## B1 966 125 23 5 0
## B2 167 112 32 6 0
## B3 54 61 23 11 0
## B4 27 27 17 2 0
## B5 2 5 1 1 0
## B1 B2 B3 B4 B5
## B1 1014 105 0 0 0
## B2 170 147 0 0 0
## B3 59 90 0 0 0
## B4 28 45 0 0 0
## B5 1 8 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 187 130 0 0 0
## B3 67 82 0 0 0
## B4 32 41 0 0 0
## B5 2 7 0 0 0
## - Fold1: cp=0
## + Fold2: cp=0
## B1 B2 B3 B4 B5
## B1 975 115 27 2 0
## B2 157 98 54 8 0
## B3 63 56 28 2 0
## B4 29 21 18 4 0
## B5 4 3 2 1 0
## B1 B2 B3 B4 B5
## B1 1010 109 0 0 0
## B2 141 176 0 0 0
## B3 62 87 0 0 0
## B4 28 44 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1034 85 0 0 0
## B2 179 138 0 0 0
## B3 74 75 0 0 0
## B4 32 40 0 0 0
## B5 5 5 0 0 0
## B1 B2 B3 B4 B5
## B1 981 138 0 0 0
## B2 132 185 0 0 0
## B3 54 95 0 0 0
## B4 23 49 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1119 0 0 0 0
## B2 317 0 0 0 0
## B3 149 0 0 0 0
## B4 72 0 0 0 0
## B5 10 0 0 0 0
## - Fold2: cp=0
## + Fold3: cp=0
## B1 B2 B3 B4 B5
## B1 998 86 23 11 0
## B2 170 99 36 12 0
## B3 78 48 15 8 0
## B4 29 24 13 6 0
## B5 2 3 0 5 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## B1 B2 B3 B4 B5
## B1 1036 82 0 0 0
## B2 193 124 0 0 0
## B3 74 75 0 0 0
## B4 25 47 0 0 0
## B5 4 6 0 0 0
## - Fold3: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.02 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.0486618 0 1.0000000
## 2 0.0200000 2 0.9026764
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
# Simplify a model
# fit_df <- glb_entity_df; glb_mdl <- step(<complex>_mdl)
# Re-key the model summary rows as 1..n; seq_len() replaces the seq(1:n)
# anti-pattern (seq(1:0) yields 1:2 instead of an empty sequence).
rownames(glb_models_df) <- seq_len(nrow(glb_models_df))
# Truncate long feature lists to 30 chars for readable plot labels
plot_models_df <- mutate(glb_models_df, feats.label=substr(feats, 1, 30))
# Regression path: rank models by OOB R-squared then fit adjusted R-squared.
if (glb_is_regression) {
print(orderBy(~ -R.sq.OOB -Adj.R.sq.fit, glb_models_df))
# NOTE(review): stop() makes the scatter plot below unreachable — presumably
# a template guard forcing manual selection of glb_sel_mdl before the
# regression path is enabled; confirm and remove once a model is chosen.
stop("glb_sel_mdl not selected")
print(myplot_scatter(plot_models_df, "Adj.R.sq.fit", "R.sq.OOB") +
geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue",
size=3.5, angle=45))
}
# Classification path: rank models (higher auc.OOB and accuracy.fit, lower
# accuracySD.fit first), pick/confirm the selected model, then diagnostic plots.
if (glb_is_classification) {
# Lower AIC is better
sel_frmla <- formula(paste("~ ", paste0("-auc.OOB", "-accuracy.fit", "+accuracySD.fit")))
print(tmp_models_df <- orderBy(sel_frmla, glb_models_df))
# print(tmp_models_sav_df <- orderBy(~ -auc.OOB -accuracy.fit +accuracySD.fit, glb_models_df))
print("Best model using metrics:")
# print(sprintf("Best model using metrics:", sel_frmla))
# print(summary(bst_mdl <- glb_models_lst[[as.numeric(rownames(tmp_models_df)[1])]]))
# Top-ranked model: rownames(tmp_models_df)[1] indexes back into glb_models_lst
myprint_mdl(bst_mdl <- glb_models_lst[[as.numeric(rownames(tmp_models_df)[1])]])
if (!is.null(glb_sel_mdl)) { # Model is user selected
print("User selected model: ")
myprint_mdl(glb_sel_mdl)
} else glb_sel_mdl <- bst_mdl
# glb_sel_mdl <- glb_models_lst[[as.numeric(rownames(tmp_models_df)[1])]]
# print(summary(glb_sel_mdl))
# print("Selected model:glb_sel_mdl:")
# glb_sel_mdl <- glb_models_lst[[as.numeric(rownames(tmp_models_df)[1])]]
# print(summary(glb_sel_mdl))
# Inverse AIC so that "bigger is better" on both plot axes
plot_models_df[, "inv.AIC.fit"] <- 1.0 / plot_models_df[, "AIC.fit"]
# NOTE(review): scalar `if` conditions below use elementwise `|` — works since
# any() returns a scalar, but `||` would be the idiomatic choice.
if (any(!is.na(plot_models_df$inv.AIC.fit)) | any(!is.na(plot_models_df$auc.OOB))) {
print(myplot_scatter(plot_models_df, "inv.AIC.fit", "auc.OOB") +
geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue",
size=3.5, angle=45))
} else warning("All NAs for inv.AIC.fit vs. auc.OOB scatter plot of glb_models_df")
if (any(!is.na(plot_models_df$auc.OOB))) {
# Scatter with CV-accuracy error bars when OOB AUC is available
print(myplot_scatter(plot_models_df, "auc.OOB", "accuracy.fit",
colorcol_name="method") +
geom_errorbar(aes(x=auc.OOB, ymin=accuracy.fit - accuracySD.fit,
ymax=accuracy.fit + accuracySD.fit),
width=(max(plot_models_df$auc.OOB)-min(plot_models_df$auc.OOB))/25) +
geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue",
size=3.5, angle=45))
} else {
# Fallback: dodge bar chart of accuracy per feature set when AUC is all NA
warning("All NAs for auc.OOB in auc.OOB vs. accuracy.fit scatter plot of glb_models_df")
print(ggplot(plot_models_df, aes(x=reorder(feats.label, accuracy.fit), y=accuracy.fit,
fill=factor(method))) +
geom_bar(stat="identity", position="dodge") +
geom_errorbar(aes(x=feats.label, ymin=accuracy.fit - accuracySD.fit,
ymax=accuracy.fit + accuracySD.fit, color=factor(method)),
width=0.2) +
theme(axis.text.x = element_text(angle = 45,vjust = 1)))
}
# mdl$times plot across models
print(myplot_scatter(plot_models_df, "inv.elapsedtime.everything", "inv.elapsedtime.final",
colorcol_name="method") +
geom_point(aes(size=accuracy.fit)) +
geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue",
size=3.5, angle=45) +
geom_smooth(method="lm"))
}
## method
## 5 rpart
## 4 rpart
## 2 rpart
## 3 rpart
## 1 rpart
## 6 rpart
## 7 rpart
## feats
## 5 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
## 4 bucket2008, diabetes, ihd, kidney, heart.failure, copd, depression, alzheimers, arthritis, osteoporosis, cancer, stroke, age
## 2 bucket2008
## 3 bucket2008, reimbursement2008
## 1 .rnorm
## 6 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
## 7 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
## n.fit inv.elapsedtime.everything inv.elapsedtime.final R.sq.fit R.sq.OOB
## 5 5000 0.5903188 4.255319 NA NA
## 4 5000 0.5485464 4.484305 NA NA
## 2 5000 0.9680542 13.698630 NA NA
## 3 5000 0.8680556 10.204082 NA NA
## 1 5000 0.6493506 9.523810 NA NA
## 6 5000 0.4555809 4.201681 NA NA
## 7 5000 0.5099439 4.273504 NA NA
## Adj.R.sq.fit SSE.fit SSE.OOB AIC.fit auc.fit auc.OOB accuracy.fit
## 5 NA 0 NA NA NA NA 0.7044028
## 4 NA 0 NA NA NA NA 0.7040027
## 2 NA 0 NA NA NA NA 0.7025995
## 3 NA 0 NA NA NA NA 0.6950006
## 1 NA 0 NA NA NA NA 0.6714002
## 6 NA 0 NA NA NA NA NA
## 7 NA 0 NA NA NA NA NA
## accuracySD.fit
## 5 0.0122012896
## 4 0.0125838480
## 2 0.0063929795
## 3 0.0082416065
## 1 0.0007592686
## 6 NA
## 7 NA
## [1] "Best model using metrics:"
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.04866180 0 1.0000000
## 2 0.01094891 2 0.9026764
## 3 0.01000000 3 0.8917275
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations, complexity param=0.01094891
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
## left son=12 (754 obs) right son=13 (169 obs)
## Primary splits:
## kidney < 0.5 to the left, improve=7.902130, (0 missing)
## diabetes < 0.5 to the left, improve=6.451528, (0 missing)
## ihd < 0.5 to the left, improve=6.445818, (0 missing)
## reimbursement2008 < 2545 to the left, improve=5.585770, (0 missing)
## arthritis < 0.5 to the left, improve=5.444392, (0 missing)
## Surrogate splits:
## reimbursement2008 < 3755 to the left, agree=0.818, adj=0.006, (0 split)
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## Node number 12: 754 observations
## predicted class=B1 expected loss=0.4814324 P(node) =0.1508
## class counts: 391 239 79 38 7
## probabilities: 0.519 0.317 0.105 0.050 0.009
##
## Node number 13: 169 observations
## predicted class=B2 expected loss=0.5739645 P(node) =0.0338
## class counts: 54 72 30 12 1
## probabilities: 0.320 0.426 0.178 0.071 0.006
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087)
## 12) kidney< 0.5 754 363 B1 (0.52 0.32 0.1 0.05 0.0093) *
## 13) kidney>=0.5 169 97 B2 (0.32 0.43 0.18 0.071 0.0059) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
## [1] "User selected model: "
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.0486618 0 1.0000000
## 2 0.0200000 2 0.9026764
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
## Warning: All NAs for inv.AIC.fit vs. auc.OOB scatter plot of glb_models_df
## Warning: All NAs for auc.OOB in auc.OOB vs. accuracy.fit scatter plot of
## glb_models_df
## Warning: Removed 2 rows containing missing values (geom_point).
# Record that the model has been selected, then replay the analytics Petri net
# with the updated set of available objects.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, "model.selected")
replay.petrisim(pn=glb_analytics_pn,
                replay.trans=glb_analytics_avl_objs,
                flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Log the start of the fit.data.training.all chunk: append a timing row with a
# bumped major step number, then echo the two most recent entries.
new_chunk_row <- data.frame(
    chunk_label="fit.data.training.all",
    chunk_step_major=max(glb_script_df$chunk_step_major) + 1,
    chunk_step_minor=0,
    elapsed=(proc.time() - glb_script_tm)["elapsed"])
glb_script_df <- rbind(glb_script_df, new_chunk_row)
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed8 run.models 5 0 13.903
## elapsed9 fit.data.training.all 6 0 35.325
# Chunk 6: fit.data.training.all
# (The knitr chunk label was fused onto this line during extraction, breaking
# the syntax; restored to a plain statement.)
# Extract per-feature importance from the selected model on the training data.
print(mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
                                          entity_df=glb_entity_df))
## importance id fit.feat
## reimbursement2008 100.000000 reimbursement2008 TRUE
## bucket2008 80.804319 bucket2008 TRUE
## diabetes 69.045297 diabetes TRUE
## ihd 68.986875 ihd TRUE
## heart.failure 45.143466 heart.failure TRUE
## kidney 5.672342 kidney TRUE
## age 0.000000 age TRUE
## alzheimers 0.000000 alzheimers TRUE
## arthritis 0.000000 arthritis TRUE
## cancer 0.000000 cancer TRUE
## copd 0.000000 copd TRUE
## depression 0.000000 depression TRUE
## osteoporosis 0.000000 osteoporosis TRUE
## stroke 0.000000 stroke TRUE
# ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var,
# rsp_var_out=glb_rsp_var_out,
# fit_df=glb_entity_df,
# OOB_df=glb_newent_df,
# method=method,
# tune_models_df=glb_tune_models_df,
# n_cv_folds=glb_n_cv_folds,
# loss_mtrx=glb_loss_mtrx,
# summaryFunction=glb_loss_smmry,
# metric="loss.error",
# maximize=FALSE)
# Re-fit the selected model's method on the full training data using the
# features that survived importance extraction; the tuning setup (summary
# function, metric, maximize direction) is inherited from the selected model.
fin_mdl_args <- list(indep_vars_vctr=mdl_feats_df$id,
                     rsp_var=glb_rsp_var,
                     rsp_var_out=glb_rsp_var_out,
                     fit_df=glb_entity_df,
                     method=glb_sel_mdl$method,
                     tune_models_df=glb_tune_models_df,
                     n_cv_folds=2, # glb_n_cv_folds,
                     loss_mtrx=glb_loss_mtrx,
                     summaryFunction=glb_sel_mdl$control$summaryFunction,
                     metric=glb_sel_mdl$metric,
                     maximize=glb_sel_mdl$maximize)
ret_lst <- do.call(myrun_mdl_fn, fin_mdl_args)
## B1 B2 B3 B4 B5
## B1 6 2 0 0 0
## B2 2 0 0 0 0
## B3 0 0 0 0 0
## B4 0 0 0 0 0
## B5 0 0 0 0 0
## + Fold1: cp=0
## B1 B2 B3 B4 B5
## B1 1458 170 38 12 0
## B2 221 170 57 27 0
## B3 96 79 34 15 0
## B4 43 37 15 14 0
## B5 5 3 4 2 0
## B1 B2 B3 B4 B5
## B1 1504 174 0 0 0
## B2 214 261 0 0 0
## B3 88 136 0 0 0
## B4 44 65 0 0 0
## B5 4 10 0 0 0
## B1 B2 B3 B4 B5
## B1 1423 255 0 0 0
## B2 172 303 0 0 0
## B3 71 153 0 0 0
## B4 37 72 0 0 0
## B5 4 10 0 0 0
## B1 B2 B3 B4 B5
## B1 1423 255 0 0 0
## B2 172 303 0 0 0
## B3 71 153 0 0 0
## B4 37 72 0 0 0
## B5 4 10 0 0 0
## B1 B2 B3 B4 B5
## B1 1423 255 0 0 0
## B2 172 303 0 0 0
## B3 71 153 0 0 0
## B4 37 72 0 0 0
## B5 4 10 0 0 0
## - Fold1: cp=0
## + Fold2: cp=0
## B1 B2 B3 B4 B5
## B1 1483 146 35 14 0
## B2 257 154 51 14 0
## B3 104 92 20 7 0
## B4 39 45 16 8 0
## B5 6 7 2 0 0
## B1 B2 B3 B4 B5
## B1 1573 105 0 0 0
## B2 306 170 0 0 0
## B3 108 115 0 0 0
## B4 43 65 0 0 0
## B5 6 9 0 0 0
## B1 B2 B3 B4 B5
## B1 1595 83 0 0 0
## B2 350 126 0 0 0
## B3 132 91 0 0 0
## B4 49 59 0 0 0
## B5 8 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1595 83 0 0 0
## B2 350 126 0 0 0
## B3 132 91 0 0 0
## B4 49 59 0 0 0
## B5 8 7 0 0 0
## B1 B2 B3 B4 B5
## B1 1595 83 0 0 0
## B2 350 126 0 0 0
## B3 132 91 0 0 0
## B4 49 59 0 0 0
## B5 8 7 0 0 0
## - Fold2: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 5000
##
## CP nsplit rel error
## 1 0.04866180 0 1.0000000
## 2 0.01094891 2 0.9026764
## 3 0.01000000 3 0.8917275
##
## Variable importance
## reimbursement2008 bucket2008 ihd diabetes
## 31 20 14 13
## heart.failure kidney
## 12 10
##
## Node number 1: 5000 observations, complexity param=0.0486618
## predicted class=B1 expected loss=0.3288 P(node) =1
## class counts: 3356 951 447 217 29
## probabilities: 0.671 0.190 0.089 0.043 0.006
## left son=2 (3025 obs) right son=3 (1975 obs)
## Primary splits:
## reimbursement2008 < 1545 to the left, improve=464.1939, (0 missing)
## bucket2008 < 1.5 to the left, improve=375.5983, (0 missing)
## diabetes < 0.5 to the left, improve=323.2704, (0 missing)
## ihd < 0.5 to the left, improve=319.3596, (0 missing)
## heart.failure < 0.5 to the left, improve=227.0499, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.860, adj=0.646, (0 split)
## ihd < 0.5 to the left, agree=0.790, adj=0.467, (0 split)
## diabetes < 0.5 to the left, agree=0.784, adj=0.454, (0 split)
## heart.failure < 0.5 to the left, agree=0.764, adj=0.403, (0 split)
## kidney < 0.5 to the left, agree=0.730, adj=0.316, (0 split)
##
## Node number 2: 3025 observations
## predicted class=B1 expected loss=0.1209917 P(node) =0.605
## class counts: 2659 228 98 37 3
## probabilities: 0.879 0.075 0.032 0.012 0.001
##
## Node number 3: 1975 observations, complexity param=0.0486618
## predicted class=B2 expected loss=0.6339241 P(node) =0.395
## class counts: 697 723 349 180 26
## probabilities: 0.353 0.366 0.177 0.091 0.013
## left son=6 (923 obs) right son=7 (1052 obs)
## Primary splits:
## reimbursement2008 < 3765 to the left, improve=38.75805, (0 missing)
## bucket2008 < 1.5 to the left, improve=30.80855, (0 missing)
## kidney < 0.5 to the left, improve=28.52915, (0 missing)
## ihd < 0.5 to the left, improve=27.61120, (0 missing)
## diabetes < 0.5 to the left, improve=23.99424, (0 missing)
## Surrogate splits:
## bucket2008 < 1.5 to the left, agree=0.887, adj=0.757, (0 split)
## kidney < 0.5 to the left, agree=0.660, adj=0.272, (0 split)
## heart.failure < 0.5 to the left, agree=0.639, adj=0.228, (0 split)
## copd < 0.5 to the left, agree=0.622, adj=0.191, (0 split)
## ihd < 0.5 to the left, agree=0.607, adj=0.159, (0 split)
##
## Node number 6: 923 observations, complexity param=0.01094891
## predicted class=B1 expected loss=0.5178765 P(node) =0.1846
## class counts: 445 311 109 50 8
## probabilities: 0.482 0.337 0.118 0.054 0.009
## left son=12 (754 obs) right son=13 (169 obs)
## Primary splits:
## kidney < 0.5 to the left, improve=7.902130, (0 missing)
## diabetes < 0.5 to the left, improve=6.451528, (0 missing)
## ihd < 0.5 to the left, improve=6.445818, (0 missing)
## reimbursement2008 < 2545 to the left, improve=5.585770, (0 missing)
## arthritis < 0.5 to the left, improve=5.444392, (0 missing)
## Surrogate splits:
## reimbursement2008 < 3755 to the left, agree=0.818, adj=0.006, (0 split)
##
## Node number 7: 1052 observations
## predicted class=B2 expected loss=0.608365 P(node) =0.2104
## class counts: 252 412 240 130 18
## probabilities: 0.240 0.392 0.228 0.124 0.017
##
## Node number 12: 754 observations
## predicted class=B1 expected loss=0.4814324 P(node) =0.1508
## class counts: 391 239 79 38 7
## probabilities: 0.519 0.317 0.105 0.050 0.009
##
## Node number 13: 169 observations
## predicted class=B2 expected loss=0.5739645 P(node) =0.0338
## class counts: 54 72 30 12 1
## probabilities: 0.320 0.426 0.178 0.071 0.006
##
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)
## 2) reimbursement2008< 1545 3025 366 B1 (0.88 0.075 0.032 0.012 0.00099) *
## 3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)
## 6) reimbursement2008< 3765 923 478 B1 (0.48 0.34 0.12 0.054 0.0087)
## 12) kidney< 0.5 754 363 B1 (0.52 0.32 0.1 0.05 0.0093) *
## 13) kidney>=0.5 169 97 B2 (0.32 0.43 0.18 0.071 0.0059) *
## 7) reimbursement2008>=3765 1052 640 B2 (0.24 0.39 0.23 0.12 0.017) *
# Capture the final fitted model, then log a minor-step timing row that keeps
# the current major step and bumps the minor step by one.
glb_fin_mdl <- ret_lst[["model"]]
last_row_ix <- nrow(glb_script_df)
glb_script_df <- rbind(glb_script_df,
    data.frame(chunk_label="fit.data.training.all",
               chunk_step_major=glb_script_df[last_row_ix, "chunk_step_major"],
               chunk_step_minor=glb_script_df[last_row_ix, "chunk_step_minor"] + 1,
               elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed9 fit.data.training.all 6 0 35.325
## elapsed10 fit.data.training.all 6 1 40.509
# In-sample diagnostics for regression models: predict on the training data,
# plot predicted vs. actual, and list the observations with the largest
# absolute prediction error.
if (glb_is_regression) {
glb_entity_df[, glb_rsp_var_out] <- predict(glb_fin_mdl, newdata=glb_entity_df)
print(myplot_scatter(glb_entity_df, glb_rsp_var, glb_rsp_var_out,
smooth=TRUE))
glb_entity_df[, paste0(glb_rsp_var_out, ".err")] <-
abs(glb_entity_df[, glb_rsp_var_out] - glb_entity_df[, glb_rsp_var])
# reformulate(c("-", "<err>")) builds the formula ~ - + <err>, which orderBy
# treats as a descending sort on the error column (largest errors first)
print(head(orderBy(reformulate(c("-", paste0(glb_rsp_var_out, ".err"))),
glb_entity_df)))
}
# In-sample diagnostics for binomial classifiers: derive class probabilities,
# sweep the probability threshold to maximize the in-sample f.score, then
# materialize hard class predictions and a confusion cross-tab.
if (glb_is_classification & glb_is_binomial) {
    # caret "train" objects and raw rpart/randomForest fits share the same
    # predict(type="prob") interface (the original had two byte-identical
    # branches for them; merged here); column 2 is P(positive class)
    if (any(class(glb_fin_mdl) %in% c("train", "rpart", "randomForest"))) {
        glb_entity_df[, paste0(glb_rsp_var_out, ".proba")] <-
            predict(glb_fin_mdl, newdata=glb_entity_df, type="prob")[, 2]
    } else if (inherits(glb_fin_mdl, "glm")) {
        stop("not implemented yet")
        glb_entity_df[, paste0(glb_rsp_var_out, ".proba")] <-
            predict(glb_fin_mdl, newdata=glb_entity_df, type="response")
    } else stop("not implemented yet")

    # ROC curve annotated with cutoffs every 0.1
    require(ROCR)
    ROCRpred <- prediction(glb_entity_df[, paste0(glb_rsp_var_out, ".proba")],
                           glb_entity_df[, glb_rsp_var])
    ROCRperf <- performance(ROCRpred, "tpr", "fpr")
    plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0, 1, 0.1), text.adj=c(-0.2,1.7))

    # Sweep thresholds 0.0..1.0 and pick the one maximizing the in-sample f.score
    thresholds_df <- data.frame(threshold=seq(0.0, 1.0, 0.1))
    thresholds_df$f.score <- sapply(1:nrow(thresholds_df), function(row_ix)
        mycompute_classifier_f.score(mdl=glb_fin_mdl, obs_df=glb_entity_df,
                                     proba_threshold=thresholds_df[row_ix, "threshold"],
                                     rsp_var=glb_rsp_var,
                                     rsp_var_out=glb_rsp_var_out))
    print(thresholds_df)
    print(myplot_line(thresholds_df, "threshold", "f.score"))
    proba_threshold <- thresholds_df[which.max(thresholds_df$f.score),
                                     "threshold"]
    # This should change to maximize f.score.OOB ???
    print(sprintf("Classifier Probability Threshold: %0.4f to maximize f.score.fit",
                  proba_threshold))
    # A user-specified threshold overrides the data-driven one
    if (is.null(glb_clf_proba_threshold))
        glb_clf_proba_threshold <- proba_threshold else {
        print(sprintf("Classifier Probability Threshold: %0.4f per user specs",
                      glb_clf_proba_threshold))
    }
    if ((class(glb_entity_df[, glb_rsp_var]) != "factor") |
        (length(levels(glb_entity_df[, glb_rsp_var])) != 2))
        stop("expecting a factor with two levels:", glb_rsp_var)
    # Map proba >= threshold to factor level 2, otherwise level 1
    glb_entity_df[, glb_rsp_var_out] <-
        factor(levels(glb_entity_df[, glb_rsp_var])[
            (glb_entity_df[, paste0(glb_rsp_var_out, ".proba")] >=
                glb_clf_proba_threshold) * 1 + 1])
    print(mycreate_xtab(glb_entity_df, c(glb_rsp_var, glb_rsp_var_out)))
    print(sprintf("f.score=%0.4f",
                  mycompute_classifier_f.score(glb_fin_mdl, glb_entity_df,
                                               glb_clf_proba_threshold,
                                               glb_rsp_var, glb_rsp_var_out)))
}
# Multiclass (non-binomial) classification: no probability threshold applies,
# so store the final model's hard class predictions on the training data directly.
if (glb_is_classification & !glb_is_binomial) {
glb_entity_df[, glb_rsp_var_out] <- predict(glb_fin_mdl, newdata=glb_entity_df, type="raw")
}
print(glb_feats_df <- mymerge_feats_importance(glb_feats_df, glb_fin_mdl, glb_entity_df))
## id cor.y cor.y.abs cor.low importance
## 13 reimbursement2008 0.40427780 0.40427780 NA 100.000000
## 4 bucket2008 0.46125446 0.46125446 1 79.916766
## 8 diabetes 0.40896269 0.40896269 1 69.555548
## 10 ihd 0.40798592 0.40798592 1 69.496645
## 9 heart.failure 0.37320420 0.37320420 1 44.647610
## 11 kidney 0.38971036 0.38971036 1 7.163930
## 3 arthritis 0.25224332 0.25224332 1 1.070597
## 1 age 0.02529723 0.02529723 1 0.000000
## 2 alzheimers 0.27882540 0.27882540 1 0.000000
## 5 cancer 0.19499896 0.19499896 1 0.000000
## 6 copd 0.31969314 0.31969314 1 0.000000
## 7 depression 0.28668189 0.28668189 1 0.000000
## 12 osteoporosis 0.21770017 0.21770017 1 0.000000
## 14 stroke 0.16603039 0.16603039 1 0.000000
# Most of this code is used again in predict.data.new chunk
# Diagnostic plots for a scored data set: one actual-vs-predicted scatter per
# important feature, plus a model-type-specific prediction summary plot.
# Reads glb_feats_df, glb_rsp_var, glb_rsp_var_out, glb_is_regression,
# glb_is_classification, and glb_id_vars from the global environment.
# Most of this code is used again in predict.data.new chunk
glb_analytics_diag_plots <- function(obs_df) {
    imp_feats_df <- subset(glb_feats_df, !is.na(importance))

    # Per-feature jittered scatter of the response columns (actual and
    # prediction), faceted by which column is shown.
    for (feat in imp_feats_df$id) {
        feat_plot_df <- melt(obs_df, id.vars=feat,
                             measure.vars=c(glb_rsp_var, glb_rsp_var_out))
        feat_plot <- myplot_scatter(feat_plot_df, feat, "value",
                                    colorcol_name="variable",
                                    facet_colcol_name="variable", jitter=TRUE) +
                     guides(color=FALSE)
        print(feat_plot)
    }

    if (glb_is_regression) {
        # Plot predictions against the two most significant features,
        # falling back to row names when only one qualifies.
        plot_vars_df <- subset(glb_feats_df, Pr.z < 0.1)
        print(myplot_prediction_regression(obs_df,
            ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2], ".rownames"),
            plot_vars_df$id[1],
            glb_rsp_var, glb_rsp_var_out)
        )
    }

    if (glb_is_classification) {
        plot_vars_df <- imp_feats_df
        if (nrow(plot_vars_df) == 0)
            warning("No features in selected model are statistically important")
        else print(myplot_prediction_classification(df=obs_df,
            feat_x=ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2],
                          ".rownames"),
            feat_y=plot_vars_df$id[1],
            rsp_var=glb_rsp_var,
            rsp_var_out=glb_rsp_var_out,
            id_vars=glb_id_vars)
        )
    }
}
glb_analytics_diag_plots(obs_df=glb_entity_df)
## age alzheimers arthritis cancer copd depression diabetes
## 199 73 0 0 0 0 0 0
## 153558 68 0 0 0 0 0 0
## 154439 70 0 0 0 1 0 1
## 162273 59 0 0 0 0 0 1
## 179045 74 1 0 0 0 0 1
## 190763 57 1 0 0 0 0 0
## 190868 67 1 0 0 0 0 0
## 192357 72 0 0 0 0 0 1
## 214932 80 1 0 1 0 1 1
## 262260 79 1 1 1 0 0 1
## 300375 88 1 0 0 0 0 1
## 307445 74 0 0 0 0 0 0
## 312790 75 0 0 0 0 0 0
## 313477 82 0 0 0 0 0 0
## 321503 86 0 0 0 0 0 0
## 338073 78 0 0 0 0 0 0
## 349461 74 0 0 0 0 0 0
## 398550 56 1 0 0 0 0 1
## 402417 53 1 0 0 0 1 1
## 456020 68 1 1 0 1 1 1
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 199 0 0 0 0 0 0
## 153558 0 0 1 0 0 1790
## 154439 0 1 1 0 0 58230
## 162273 1 1 1 0 0 1820
## 179045 1 1 1 0 1 2360
## 190763 1 0 1 1 0 1580
## 190868 1 0 1 0 0 2590
## 192357 1 1 1 0 0 1640
## 214932 1 1 1 0 0 71660
## 262260 0 1 1 0 1 112980
## 300375 1 1 1 0 1 68350
## 307445 0 0 0 0 0 0
## 312790 0 0 0 0 0 0
## 313477 0 0 0 0 0 0
## 321503 0 0 0 0 0 0
## 338073 0 0 0 0 0 0
## 349461 0 0 0 0 0 0
## 398550 1 1 1 1 1 64830
## 402417 1 1 0 0 0 65590
## 456020 1 1 1 1 0 133330
## bucket2008 reimbursement2009 bucket2009 .rnorm bucket2009.fctr
## 199 1 0 1 0.09472856 B1
## 153558 1 560 1 0.74713704 B1
## 154439 5 570 1 0.11459397 B1
## 162273 1 670 1 2.30203690 B1
## 179045 1 880 1 -0.45292024 B1
## 190763 1 1030 1 -0.98121958 B1
## 190868 1 1030 1 0.60288110 B1
## 192357 1 1050 1 0.70214341 B1
## 214932 5 1340 1 -1.76578293 B1
## 262260 5 2040 1 0.62732336 B1
## 300375 5 2810 1 -0.19715402 B1
## 307445 1 3000 2 0.55120317 B2
## 312790 1 3150 2 -0.37917847 B2
## 313477 1 3170 2 -0.15857725 B2
## 321503 1 3410 2 0.43252159 B2
## 338073 1 4000 2 -0.81670317 B2
## 349461 1 4490 2 0.80808499 B2
## 398550 5 8570 3 -1.16511891 B3
## 402417 5 9200 3 -2.13025718 B3
## 456020 5 59270 5 -2.06323014 B5
## bucket2009.fctr.prediction bucket2009.fctr.prediction.accurate
## 199 B1 TRUE
## 153558 B2 FALSE
## 154439 B2 FALSE
## 162273 B2 FALSE
## 179045 B2 FALSE
## 190763 B2 FALSE
## 190868 B2 FALSE
## 192357 B2 FALSE
## 214932 B2 FALSE
## 262260 B2 FALSE
## 300375 B2 FALSE
## 307445 B1 FALSE
## 312790 B1 FALSE
## 313477 B1 FALSE
## 321503 B1 FALSE
## 338073 B1 FALSE
## 349461 B1 FALSE
## 398550 B2 FALSE
## 402417 B2 FALSE
## 456020 B2 FALSE
## .label
## 199 .199
## 153558 .153558
## 154439 .154439
## 162273 .162273
## 179045 .179045
## 190763 .190763
## 190868 .190868
## 192357 .192357
## 214932 .214932
## 262260 .262260
## 300375 .300375
## 307445 .307445
## 312790 .312790
## 313477 .313477
## 321503 .321503
## 338073 .338073
## 349461 .349461
## 398550 .398550
## 402417 .402417
## 456020 .456020
# Mark the training-data prediction and the final model as available, then
# replay the analytics Petri net to advance its state.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                            "data.training.all.prediction", "model.final")
replay.petrisim(pn=glb_analytics_pn,
                replay.trans=glb_analytics_avl_objs,
                flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Log the start of the predict.data.new chunk: append a timing row with a
# bumped major step number, then echo the two most recent entries.
new_chunk_row <- data.frame(
    chunk_label="predict.data.new",
    chunk_step_major=max(glb_script_df$chunk_step_major) + 1,
    chunk_step_minor=0,
    elapsed=(proc.time() - glb_script_tm)["elapsed"])
glb_script_df <- rbind(glb_script_df, new_chunk_row)
print(tail(glb_script_df, 2))
## chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed10 fit.data.training.all 6 1 40.509
## elapsed11 predict.data.new 7 0 58.094
# Chunk 7: predict.data.new
# (The knitr chunk label was fused onto this line during extraction, breaking
# the syntax; restored to a plain statement.)
# Score the new/validation data with the final regression model.
if (glb_is_regression)
    glb_newent_df[, glb_rsp_var_out] <- predict(glb_fin_mdl,
                                                newdata=glb_newent_df, type="response")
# New-data predictions for binomial classifiers: final-model probabilities
# thresholded at glb_clf_proba_threshold, plus dummy-model predictions as a
# baseline for comparison.
if (glb_is_classification & glb_is_binomial) {
    # Compute selected model predictions; caret "train" and raw
    # rpart/randomForest fits share the same predict(type="prob") interface
    # (the original had two byte-identical branches for them; merged here).
    # Column 2 is P(positive class).
    if (any(class(glb_fin_mdl) %in% c("train", "rpart", "randomForest"))) {
        glb_newent_df[, paste0(glb_rsp_var_out, ".proba")] <-
            predict(glb_fin_mdl, newdata=glb_newent_df, type="prob")[, 2]
    } else if (inherits(glb_fin_mdl, "glm")) {
        stop("not implemented yet")
        glb_newent_df[, paste0(glb_rsp_var_out, ".proba")] <-
            predict(glb_fin_mdl, newdata=glb_newent_df, type="response")
    } else stop("not implemented yet")
    if ((class(glb_newent_df[, glb_rsp_var]) != "factor") |
        (length(levels(glb_newent_df[, glb_rsp_var])) != 2))
        stop("expecting a factor with two levels:", glb_rsp_var)
    # Map proba >= threshold to factor level 2, otherwise level 1
    glb_newent_df[, glb_rsp_var_out] <-
        factor(levels(glb_newent_df[, glb_rsp_var])[
            (glb_newent_df[, paste0(glb_rsp_var_out, ".proba")] >=
                glb_clf_proba_threshold) * 1 + 1])

    # Compute dummy model predictions, thresholded the same way
    glb_newent_df[, paste0(glb_rsp_var, ".predictdmy.proba")] <-
        predict(glb_dmy_mdl, newdata=glb_newent_df, type="prob")[, 2]
    if ((class(glb_newent_df[, glb_rsp_var]) != "factor") |
        (length(levels(glb_newent_df[, glb_rsp_var])) != 2))
        stop("expecting a factor with two levels:", glb_rsp_var)
    glb_newent_df[, paste0(glb_rsp_var, ".predictdmy")] <-
        factor(levels(glb_newent_df[, glb_rsp_var])[
            (glb_newent_df[, paste0(glb_rsp_var, ".predictdmy.proba")] >=
                glb_clf_proba_threshold) * 1 + 1])
}
# New-data predictions for multiclass classifiers: score the new data with the
# baseline, most-frequent-outcome, dummy, and each competing model so their
# prediction quality can be compared side by side.
if (glb_is_classification & !glb_is_binomial) {
    # Name builder for the per-model prediction columns
    pred_col <- function(sfx) paste0(glb_rsp_var, ".predict", sfx)

    # Baseline predictions: map the raw baseline column into factor space
    # when a raw-to-var mapping is configured.
    if (!is.null(glb_bsl_mdl_var)) {
        bsl_raw <- glb_newent_df[, glb_bsl_mdl_var]
        glb_newent_df[, pred_col("bsl")] <-
            if (is.null(glb_map_rsp_raw_to_var)) bsl_raw else
                glb_map_rsp_raw_to_var(bsl_raw)
    }

    # Most frequent outcome in the training data
    mfo_level <- names(sort(table(glb_entity_df[, glb_rsp_var]), decreasing=TRUE))[1]
    glb_newent_df[, pred_col("mfo")] <- as.factor(mfo_level)

    # Dummy model predictions - different from most frequent outcome - shd be from runif
    glb_newent_df[, pred_col("dmy")] <-
        predict(glb_dmy_mdl, newdata=glb_newent_df, type="raw")

    # Selected models: no loss matrix (nlm), with loss matrix (wlm),
    # with t(loss matrix) (sel)
    glb_newent_df[, pred_col("nlm")] <-
        predict(glb_sel_nlm_mdl, newdata=glb_newent_df, type="raw")
    glb_newent_df[, pred_col("wlm")] <-
        predict(glb_sel_wlm_mdl, newdata=glb_newent_df, type="raw")
    glb_newent_df[, pred_col("sel")] <-
        predict(glb_sel_mdl, newdata=glb_newent_df, type="raw")

    # Final model predictions
    glb_newent_df[, glb_rsp_var_out] <-
        predict(glb_fin_mdl, newdata=glb_newent_df, type="raw")
}
myprint_df(glb_newent_df[, c(glb_id_vars, glb_rsp_var, glb_rsp_var_out)])
## bucket2009.fctr bucket2009.fctr.prediction
## 6 B1 B1
## 26 B1 B1
## 95 B1 B1
## 121 B1 B1
## 197 B1 B1
## 286 B1 B1
## bucket2009.fctr bucket2009.fctr.prediction
## 141700 B1 B1
## 240147 B1 B1
## 254198 B1 B2
## 272537 B1 B1
## 429148 B3 B1
## 451493 B4 B2
## bucket2009.fctr bucket2009.fctr.prediction
## 457490 B5 B2
## 457576 B5 B2
## 457604 B5 B2
## 457705 B5 B2
## 457824 B5 B2
## 457912 B5 B2
# Out-of-sample regression diagnostics: SSE and RMSE on the new data, a
# predicted-vs-actual scatter, and the worst-predicted observations.
if (glb_is_regression) {
    # Residuals and SSE computed once (the original re-evaluated the full
    # sum-of-squares expression separately for SSE and RMSE)
    resid_err <- glb_newent_df[, glb_rsp_var_out] - glb_newent_df[, glb_rsp_var]
    sse <- sum(resid_err ^ 2)
    print(sprintf("Total SSE: %0.4f", sse))
    print(sprintf("RMSE: %0.4f", (sse / nrow(glb_newent_df)) ^ 0.5))
    print(myplot_scatter(glb_newent_df, glb_rsp_var, glb_rsp_var_out,
                         smooth=TRUE))
    glb_newent_df[, paste0(glb_rsp_var_out, ".err")] <- abs(resid_err)
    # reformulate(c("-", "<err>")) builds ~ - + <err>: a descending sort key
    # for orderBy, so the largest errors are listed first
    print(head(orderBy(reformulate(c("-", paste0(glb_rsp_var_out, ".err"))),
                       glb_newent_df)))
    # glb_newent_df[, "<Output Pred variable>"] <- func(glb_newent_df[, glb_pred_var_name])
}
# Out-of-sample diagnostics for binomial classifiers: AUC, confusion cross-tab,
# f.score, sensitivity/specificity/accuracy, and a dummy-model comparison.
if (glb_is_classification & glb_is_binomial) {
ROCRpred <- prediction(glb_newent_df[, paste0(glb_rsp_var_out, ".proba")],
glb_newent_df[, glb_rsp_var])
print(sprintf("auc=%0.4f", auc <- as.numeric(performance(ROCRpred, "auc")@y.values)))
print(sprintf("probability threshold=%0.4f", glb_clf_proba_threshold))
print(newent_conf_df <- mycreate_xtab(glb_newent_df,
c(glb_rsp_var, glb_rsp_var_out)))
print(sprintf("f.score.sel=%0.4f",
mycompute_classifier_f.score(mdl=glb_fin_mdl, obs_df=glb_newent_df,
proba_threshold=glb_clf_proba_threshold,
rsp_var=glb_rsp_var,
rsp_var_out=glb_rsp_var_out)))
# NOTE(review): the index arithmetic below assumes mycreate_xtab returns the
# response in column 1, predicted-negative in column 2, predicted-positive in
# column 3, with row 1 = actual negative and row 2 = actual positive --
# confirm against mycreate_xtab's output
print(sprintf("sensitivity=%0.4f", newent_conf_df[2, 3] /
(newent_conf_df[2, 3] + newent_conf_df[2, 2])))
print(sprintf("specificity=%0.4f", newent_conf_df[1, 2] /
(newent_conf_df[1, 2] + newent_conf_df[1, 3])))
print(sprintf("accuracy=%0.4f", (newent_conf_df[1, 2] + newent_conf_df[2, 3]) /
(newent_conf_df[1, 2] + newent_conf_df[2, 3] +
newent_conf_df[1, 3] + newent_conf_df[2, 2])))
# Dummy-model cross-tab and f.score provide a floor the selected model must beat
print(mycreate_xtab(glb_newent_df, c(glb_rsp_var, paste0(glb_rsp_var, ".predictdmy"))))
print(sprintf("f.score.dmy=%0.4f",
mycompute_classifier_f.score(mdl=glb_dmy_mdl, obs_df=glb_newent_df,
proba_threshold=glb_clf_proba_threshold,
rsp_var=glb_rsp_var,
rsp_var_out=paste0(glb_rsp_var, ".predictdmy"))))
}
# Out-of-sample diagnostics for multiclass classifiers: compute accuracy and
# expected loss (weighted by glb_loss_mtrx) for every prediction variant and
# compare them on an accuracy vs. inverse-loss scatter plot.
if (glb_is_classification & !glb_is_binomial) {
# Build a one-row quality summary (accuracy, loss.error) for one prediction
# column; "fin" uses glb_rsp_var_out, all others use <rsp_var>.predict<type>
mycompute_prediction_quality <- function(mdl_type) {
print(sprintf("prediction model type: %s", mdl_type))
print(newent_conf_df <- mycreate_xtab(glb_newent_df,
c(glb_rsp_var,
ifelse(mdl_type == "fin", glb_rsp_var_out,
paste0(glb_rsp_var, ".predict", mdl_type)))))
# Absent actual/predicted combinations come back as NA; count them as 0
newent_conf_df[is.na(newent_conf_df)] <- 0
newent_conf_mtrx <- as.matrix(newent_conf_df[, -1])  # drop the response column
# Pad with zero columns so the confusion matrix is square (classes the model
# never predicted get all-zero columns).
# NOTE(review): this assumes the model never predicts MORE distinct classes
# than appear as actuals (ncol <= nrow); otherwise the rep() length is
# negative and this errors -- confirm
newent_conf_mtrx <- cbind(newent_conf_mtrx,
matrix(rep(0,
(nrow(newent_conf_mtrx) - ncol(newent_conf_mtrx)) * nrow(newent_conf_mtrx)),
byrow=TRUE, nrow=nrow(newent_conf_mtrx)))
return(data.frame(mdl_type=mdl_type,
accuracy=sum(diag(newent_conf_mtrx)) * 1.0 / sum(newent_conf_mtrx),
loss.error=sum(newent_conf_mtrx * glb_loss_mtrx) / nrow(glb_newent_df)))
}
# Evaluate every prediction variant computed in the predict.data.new chunk
predict_qlty_df <- data.frame()
for (type in c("bsl", "mfo", "dmy", "nlm", "wlm", "sel", "fin"))
predict_qlty_df <- rbind(predict_qlty_df, mycompute_prediction_quality(type))
print(predict_qlty_df)
# Invert loss so "up and to the right" is better on the comparison plot
predict_qlty_df$inv.loss.error <- 1.0 / predict_qlty_df$loss.error
print(myplot_scatter(predict_qlty_df, "accuracy", "inv.loss.error",
colorcol_name="mdl_type"))
}
## [1] "prediction model type: bsl"
## bucket2009.fctr bucket2009.fctr.predictbsl.B1
## 1 B1 2993
## 2 B2 430
## 3 B3 198
## 4 B4 78
## 5 B5 10
## bucket2009.fctr.predictbsl.B2 bucket2009.fctr.predictbsl.B3
## 1 216 102
## 2 292 124
## 3 126 68
## 4 51 39
## 5 5 5
## bucket2009.fctr.predictbsl.B4 bucket2009.fctr.predictbsl.B5
## 1 39 6
## 2 88 17
## 3 48 7
## 4 39 10
## 5 6 3
## [1] "prediction model type: mfo"
## bucket2009.fctr bucket2009.fctr.predictmfo.B1
## 1 B1 3356
## 2 B2 951
## 3 B3 447
## 4 B4 217
## 5 B5 29
## [1] "prediction model type: dmy"
## bucket2009.fctr bucket2009.fctr.predictdmy.B1
## 1 B1 3356
## 2 B2 951
## 3 B3 447
## 4 B4 217
## 5 B5 29
## [1] "prediction model type: nlm"
## bucket2009.fctr bucket2009.fctr.predictnlm.B1
## 1 B1 3011
## 2 B2 455
## 3 B3 195
## 4 B4 80
## 5 B5 11
## bucket2009.fctr.predictnlm.B2
## 1 345
## 2 496
## 3 252
## 4 137
## 5 18
## [1] "prediction model type: wlm"
## bucket2009.fctr bucket2009.fctr.predictwlm.B1
## 1 B1 3011
## 2 B2 455
## 3 B3 195
## 4 B4 80
## 5 B5 11
## bucket2009.fctr.predictwlm.B2
## 1 345
## 2 496
## 3 252
## 4 137
## 5 18
## [1] "prediction model type: sel"
## bucket2009.fctr bucket2009.fctr.predictsel.B1
## 1 B1 3076
## 2 B2 524
## 3 B3 230
## 4 B4 89
## 5 B5 12
## bucket2009.fctr.predictsel.B2
## 1 280
## 2 427
## 3 217
## 4 128
## 5 17
## [1] "prediction model type: fin"
## bucket2009.fctr bucket2009.fctr.prediction.B1
## 1 B1 3011
## 2 B2 455
## 3 B3 195
## 4 B4 80
## 5 B5 11
## bucket2009.fctr.prediction.B2
## 1 345
## 2 496
## 3 252
## 4 137
## 5 18
## mdl_type accuracy loss.error
## 1 bsl 0.6790 0.7560
## 2 mfo 0.6712 1.0448
## 3 dmy 0.6712 1.0448
## 4 nlm 0.7014 0.7526
## 5 wlm 0.7014 0.7526
## 6 sel 0.7006 0.7852
## 7 fin 0.7014 0.7526
glb_analytics_diag_plots(obs_df=glb_newent_df)
## age alzheimers arthritis cancer copd depression diabetes
## 6 68 0 0 0 0 0 0
## 124097 76 0 0 0 0 0 0
## 129688 73 0 1 0 0 0 1
## 159234 90 0 0 0 1 0 1
## 169528 74 0 0 0 0 0 1
## 174385 82 0 1 1 0 0 1
## 182164 67 0 0 0 0 0 0
## 183017 67 0 1 0 0 0 0
## 183925 82 1 0 0 1 1 1
## 191662 80 0 0 0 0 0 1
## 225906 67 1 1 0 1 1 1
## 276769 85 0 0 1 1 1 0
## 288549 89 1 0 1 0 0 1
## 307826 53 0 0 0 0 0 0
## 309958 97 0 0 0 0 0 0
## 312065 68 0 0 0 0 0 0
## 315618 53 0 0 0 0 0 0
## 322157 45 0 0 0 0 0 0
## 335790 72 0 0 0 0 0 0
## 380565 73 1 1 1 1 0 1
## heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 6 0 0 0 0 0 0
## 124097 1 0 1 0 0 1960
## 129688 0 1 1 0 0 2920
## 159234 1 1 1 0 0 73860
## 169528 0 1 1 0 0 2350
## 174385 1 1 1 1 0 73650
## 182164 0 1 1 0 0 1690
## 183017 0 1 1 0 0 2480
## 183925 1 1 1 0 0 84040
## 191662 1 1 1 1 0 2700
## 225906 1 1 1 1 1 98110
## 276769 1 1 1 0 0 86040
## 288549 1 1 1 0 0 64380
## 307826 0 0 0 0 0 0
## 309958 0 0 0 0 0 0
## 312065 0 0 0 0 0 0
## 315618 0 0 0 0 0 0
## 322157 0 0 0 0 0 0
## 335790 0 0 0 0 0 0
## 380565 1 1 1 0 1 173320
## bucket2008 reimbursement2009 bucket2009 .rnorm bucket2009.fctr
## 6 1 0 1 -0.2112782 B1
## 124097 1 200 1 1.1792599 B1
## 129688 1 260 1 0.4691106 B1
## 159234 5 630 1 0.3867920 B1
## 169528 1 760 1 0.7510352 B1
## 174385 5 820 1 -0.9283519 B1
## 182164 1 920 1 0.4947145 B1
## 183017 1 930 1 -0.3375198 B1
## 183925 5 940 1 0.1999707 B1
## 191662 1 1040 1 1.3930878 B1
## 225906 5 1490 1 -1.1052827 B1
## 276769 5 2300 1 0.7084125 B1
## 288549 5 2540 1 1.5000372 B1
## 307826 1 3010 2 0.4305946 B2
## 309958 1 3070 2 1.6863455 B2
## 312065 1 3130 2 -0.6355543 B2
## 315618 1 3230 2 -0.7178728 B2
## 322157 1 3430 2 0.3348709 B2
## 335790 1 3910 2 -0.0957258 B2
## 380565 5 6470 2 -0.7323969 B2
## bucket2009.fctr.predictbsl bucket2009.fctr.predictmfo
## 6 B1 B1
## 124097 B1 B1
## 129688 B1 B1
## 159234 B5 B1
## 169528 B1 B1
## 174385 B5 B1
## 182164 B1 B1
## 183017 B1 B1
## 183925 B5 B1
## 191662 B1 B1
## 225906 B5 B1
## 276769 B5 B1
## 288549 B5 B1
## 307826 B1 B1
## 309958 B1 B1
## 312065 B1 B1
## 315618 B1 B1
## 322157 B1 B1
## 335790 B1 B1
## 380565 B5 B1
## bucket2009.fctr.predictdmy bucket2009.fctr.predictnlm
## 6 B1 B1
## 124097 B1 B2
## 129688 B1 B2
## 159234 B1 B2
## 169528 B1 B2
## 174385 B1 B2
## 182164 B1 B2
## 183017 B1 B2
## 183925 B1 B2
## 191662 B1 B2
## 225906 B1 B2
## 276769 B1 B2
## 288549 B1 B2
## 307826 B1 B1
## 309958 B1 B1
## 312065 B1 B1
## 315618 B1 B1
## 322157 B1 B1
## 335790 B1 B1
## 380565 B1 B2
## bucket2009.fctr.predictwlm bucket2009.fctr.predictsel
## 6 B1 B1
## 124097 B2 B1
## 129688 B2 B1
## 159234 B2 B2
## 169528 B2 B1
## 174385 B2 B2
## 182164 B2 B1
## 183017 B2 B1
## 183925 B2 B2
## 191662 B2 B1
## 225906 B2 B2
## 276769 B2 B2
## 288549 B2 B2
## 307826 B1 B1
## 309958 B1 B1
## 312065 B1 B1
## 315618 B1 B1
## 322157 B1 B1
## 335790 B1 B1
## 380565 B2 B2
## bucket2009.fctr.prediction bucket2009.fctr.prediction.accurate
## 6 B1 TRUE
## 124097 B2 FALSE
## 129688 B2 FALSE
## 159234 B2 FALSE
## 169528 B2 FALSE
## 174385 B2 FALSE
## 182164 B2 FALSE
## 183017 B2 FALSE
## 183925 B2 FALSE
## 191662 B2 FALSE
## 225906 B2 FALSE
## 276769 B2 FALSE
## 288549 B2 FALSE
## 307826 B1 FALSE
## 309958 B1 FALSE
## 312065 B1 FALSE
## 315618 B1 FALSE
## 322157 B1 FALSE
## 335790 B1 FALSE
## 380565 B2 TRUE
## .label
## 6 .6
## 124097 .124097
## 129688 .129688
## 159234 .159234
## 169528 .169528
## 174385 .174385
## 182164 .182164
## 183017 .183017
## 183925 .183925
## 191662 .191662
## 225906 .225906
## 276769 .276769
## 288549 .288549
## 307826 .307826
## 309958 .309958
## 312065 .312065
## 315618 .315618
## 322157 .322157
## 335790 .335790
## 380565 .380565
# Replay the analytics Petri net now that "data.new.prediction" is available:
# the assignment inside the replay.trans argument deliberately appends the new
# object to glb_analytics_avl_objs as a side effect before the replay runs.
# NOTE(review): replay.petrisim and glb_analytics_pn come from mypetrinet.R —
# presumably flip_coord=TRUE swaps the plot axes; confirm against that source.
tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.new.prediction")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
## 6.0000 6 0 0 1 2
#print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variances of mpg across the am_fctr groups appear unequal, hence Welch's t-test (var.equal=FALSE) is used.
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
# subset(cars_df, am_fctr == "manual")$mpg,
# var.equal=FALSE)$conf)
We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence): manual transmissions yield better miles per gallon than automatic transmissions.
## chunk_label chunk_step_major chunk_step_minor elapsed
## 10 fit.data.training.all 6 0 35.325
## 12 predict.data.new 7 0 58.094
## 2 cleanse_data 2 0 6.270
## 11 fit.data.training.all 6 1 40.509
## 6 extract_features 3 0 10.942
## 4 manage_missing_data 2 2 7.808
## 9 run.models 5 0 13.903
## 7 select_features 4 0 12.299
## 5 encode_retype_data 2 3 8.228
## 8 remove_correlated_features 4 1 12.506
## 3 inspectORexplore.data 2 1 6.301
## 1 import_data 1 0 0.002
## elapsed_diff
## 10 21.422
## 12 17.585
## 2 6.268
## 11 5.184
## 6 2.714
## 4 1.507
## 9 1.397
## 7 1.357
## 5 0.420
## 8 0.207
## 3 0.031
## 1 0.000
## [1] "Total Elapsed Time: 58.094 secs"
## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.2 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] tcltk grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rpart.plot_1.5.2 rpart_4.1-9 caret_6.0-41 lattice_0.20-31
## [5] ROCR_1.0-7 gplots_2.16.0 reshape2_1.4.1 sqldf_0.4-10
## [9] RSQLite_1.0.0 DBI_0.3.1 gsubfn_0.6-6 proto_0.3-10
## [13] plyr_1.8.1 caTools_1.17.1 doBy_4.5-13 survival_2.38-1
## [17] ggplot2_1.0.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 BradleyTerry2_1.0-6 brglm_0.5-9
## [4] car_2.0-25 chron_2.3-45 class_7.3-12
## [7] codetools_0.2-11 colorspace_1.2-6 compiler_3.1.3
## [10] digest_0.6.8 e1071_1.6-4 evaluate_0.5.5
## [13] foreach_1.4.2 formatR_1.1 gdata_2.13.3
## [16] gtable_0.1.2 gtools_3.4.1 htmltools_0.2.6
## [19] iterators_1.0.7 KernSmooth_2.23-14 knitr_1.9
## [22] labeling_0.3 lme4_1.1-7 MASS_7.3-40
## [25] Matrix_1.2-0 mgcv_1.8-6 minqa_1.2.4
## [28] munsell_0.4.2 nlme_3.1-120 nloptr_1.0.4
## [31] nnet_7.3-9 parallel_3.1.3 pbkrtest_0.4-2
## [34] quantreg_5.11 Rcpp_0.11.5 rmarkdown_0.5.1
## [37] scales_0.2.4 SparseM_1.6 splines_3.1.3
## [40] stringr_0.6.2 tools_3.1.3 yaml_2.1.13